tesseract  5.0.0
pageiterator.h
Go to the documentation of this file.
1 // File: pageiterator.h
3 // Description: Iterator for tesseract page structure that avoids using
4 // tesseract internal data structures.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
21 #define TESSERACT_CCMAIN_PAGEITERATOR_H_
22 
23 #include "export.h"
24 #include "publictypes.h"
25 
26 struct Pix;
27 struct Pta;
28 
29 namespace tesseract {
30 
31 struct BlamerBundle;
32 class C_BLOB_IT;
33 class PAGE_RES;
34 class PAGE_RES_IT;
35 class WERD;
36 
37 class Tesseract;
38 
53 public:
// Constructs a PageIterator from a PAGE_RES, a Tesseract instance, two
// scaling values (scale, scaled_yres) and a rectangle given as left, top,
// width and height.
// NOTE(review): this header does not show who owns page_res and tesseract,
// nor the units of scale/scaled_yres -- confirm against the implementation.
68  PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
69  int scaled_yres, int rect_left, int rect_top, int rect_width,
70  int rect_height);
// Virtual destructor. The class declares virtual methods and a protected
// section, so it can be deleted safely through a base-class pointer.
71  virtual ~PageIterator();
72 
// Copy constructor: builds a new iterator from src.
// NOTE(review): whether pointer members are shared or duplicated is decided
// in the implementation, which is not visible here.
79  PageIterator(const PageIterator &src);
// Copy assignment from src. Note the return type is a *const* reference
// (presumably to *this), not the conventional non-const PageIterator&, so
// the result of an assignment cannot be used to mutate the iterator.
80  const PageIterator &operator=(const PageIterator &src);
81 
// Reports whether this iterator and the given PAGE_RES_IT are positioned at
// the same word. Const: does not move or modify this iterator.
// NOTE(review): behaviour for a null `other` is not visible here.
83  bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
84 
85  // ============= Moving around within the page ============.
86 
// Repositions the iterator; by its name, to the beginning of the page
// (confirm in the implementation). Virtual so derived iterators can extend
// the repositioning.
91  virtual void Begin();
92 
// Repositions the iterator; presumably back to the start of the paragraph
// it is currently in -- confirm in the implementation. Virtual so derived
// iterators can override it.
98  virtual void RestartParagraph();
99 
// Reports whether the current position lies within the first text line of
// its paragraph. Const: does not move the iterator.
104  bool IsWithinFirstTextlineOfParagraph() const;
105 
// Repositions the iterator; presumably back to the start of the row (text
// line) it is currently in -- confirm in the implementation. Virtual so
// derived iterators can override it.
111  virtual void RestartRow();
112 
// Advances the iterator to the next element at the given level.
// NOTE(review): the bool result presumably reports whether another element
// was found (false at the end of the page) -- confirm in the implementation.
124  virtual bool Next(PageIteratorLevel level);
125 
// Reports whether the iterator is at the beginning of an element of the
// given level. Const: does not move the iterator.
139  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
140 
// Takes two levels: `level` and `element`.
// NOTE(review): by name and argument order this presumably reports whether
// the current `element` is the last one inside the enclosing `level` --
// confirm in the implementation. Const: does not move the iterator.
157  virtual bool IsAtFinalElement(PageIteratorLevel level,
158  PageIteratorLevel element) const;
159 
// Compares this iterator with `other` and returns an int.
// NOTE(review): presumably a three-way ordering (negative / zero / positive)
// of the two positions; requirements on `other` (e.g. same page) are not
// visible here -- confirm in the implementation.
166  int Cmp(const PageIterator &other) const;
167 
168  // ============= Accessing data ==============.
169  // Coordinate system:
170  // Integer coordinates are at the cracks between the pixels.
171  // The top-left corner of the top-left pixel in the image is at (0,0).
172  // The bottom-right corner of the bottom-right pixel in the image is at
173  // (width, height).
174  // Every bounding box goes from the top-left of the top-left contained
175  // pixel to the bottom-right of the bottom-right contained pixel, so
176  // the bounding box of the single top-left pixel in the image is:
177  // (0,0)->(1,1).
178  // If an image rectangle has been set in the API, then returned coordinates
179  // relate to the original (full) image, rather than the rectangle.
180 
// Stores the two flags in include_upper_dots_ and include_lower_dots_.
// Nothing else happens here; the stored values only matter to later calls
// that read those members.
// NOTE(review): sitting directly above the BoundingBox methods, these flags
// presumably control whether dots above/below the text are included in the
// returned boxes -- confirm in the implementation.
190  void SetBoundingBoxComponents(bool include_upper_dots,
191  bool include_lower_dots) {
192  include_upper_dots_ = include_upper_dots;
193  include_lower_dots_ = include_lower_dots;
194  }
195 
// Writes the bounding box of the current element at the given level through
// the left, top, right and bottom output pointers. Coordinates follow the
// "Coordinate system" notes earlier in this section.
// NOTE(review): the bool result presumably signals whether a box was
// available; null output pointers are not documented as allowed.
205  bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
206  int *bottom) const;
// Overload of BoundingBox taking an additional `padding` argument.
// NOTE(review): presumably the box is grown by `padding` on each side;
// whether it is clipped to the image is not visible here -- confirm.
207  bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
208  int *right, int *bottom) const;
// Same parameter list as the unpadded BoundingBox.
// NOTE(review): by name this presumably returns the box in the iterator's
// internal coordinates rather than those of the original image -- confirm
// in the implementation before relying on either convention.
214  bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
215  int *right, int *bottom) const;
216 
// Reports whether there is nothing at the current position for the given
// level. Const: does not move the iterator.
218  bool Empty(PageIteratorLevel level) const;
219 
// Returns the PolyBlockType of the current block.
// NOTE(review): the value returned when there is no current block is not
// visible here.
224  PolyBlockType BlockType() const;
225 
// Returns a Pta (point array) for the current block's polygon.
// NOTE(review): ownership of the returned Pta, and whether nullptr can be
// returned, are not visible in this header -- confirm before freeing or
// dereferencing.
233  Pta *BlockPolygon() const;
234 
// Returns a Pix holding a binary image of the current element at the given
// level.
// NOTE(review): ownership of the returned Pix, and whether nullptr can be
// returned, are not visible in this header -- confirm.
241  Pix *GetBinaryImage(PageIteratorLevel level) const;
242 
// Returns a Pix for the current element at the given level, taking a
// padding amount and the original image, and writing two ints through
// `left` and `top`.
// NOTE(review): presumably the result is cut from original_img with
// `padding` around the element, and left/top receive its top-left corner.
// Ownership of the returned Pix is not visible here -- confirm.
254  Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
255  int *left, int *top) const;
256 
// Writes the baseline of the current element at the given level as two
// points, (x1, y1) and (x2, y2), through the output pointers.
// NOTE(review): the bool result presumably signals whether a baseline was
// available -- confirm in the implementation.
263  bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
264  int *y2) const;
265 
266  // Returns the attributes of the current row.
// The three values are written through the row_height, descenders and
// ascenders output pointers; the method itself returns nothing.
// NOTE(review): units and sign conventions are not visible here.
267  void RowAttributes(float *row_height, float *descenders,
268  float *ascenders) const;
269 
// Writes the orientation, writing direction, textline order and deskew
// angle through the four output pointers.
// This method is itself named Orientation, so inside the class the
// unqualified name would refer to the method, not the type; hence the
// parameter types are spelled with an explicit tesseract:: qualifier.
// NOTE(review): the unit of deskew_angle is not visible here.
278  void Orientation(tesseract::Orientation *orientation,
279  tesseract::WritingDirection *writing_direction,
280  tesseract::TextlineOrder *textline_order,
281  float *deskew_angle) const;
282 
// Writes information about the current paragraph through the four output
// pointers: its justification, whether it is a list item, whether it is a
// "crown", and the indent of its first line.
// NOTE(review): the meaning of "crown" and the unit of first_line_indent
// are not visible in this header -- confirm in the implementation.
311  void ParagraphInfo(tesseract::ParagraphJustification *justification,
312  bool *is_list_item, bool *is_crown,
313  int *first_line_indent) const;
314 
315  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
316  // of the current word to the given pointer (takes ownership of the pointer)
317  // and returns true.
318  // Can only be used when iterating on the word level.
// NOTE(review): the result when the current word is nullptr is not stated;
// presumably false, with ownership of blamer_bundle left with the caller --
// confirm in the implementation.
319  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
320 
321 protected:
// Protected helper taking an integer offset.
// NOTE(review): presumably sets up iteration within the current word,
// starting `offset` elements in -- confirm in the implementation.
326  void BeginWord(int offset);
327 
351  C_BLOB_IT *cblob_it_;
356  int scale_;
362 };
363 
364 } // namespace tesseract.
365 
366 #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
ParagraphJustification
Definition: publictypes.h:248
void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots)
Definition: pageiterator.h:190
#define TESS_API
Definition: export.h:34