tesseract  5.0.0
pageres.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.h (Formerly page_res.h)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef PAGERES_H
20 #define PAGERES_H
21 
22 #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
23 #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
24 #include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
25 #include "genericvector.h" // for PointerVector
26 #include "matrix.h" // for MATRIX
27 #include "normalis.h" // for DENORM
28 #include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only)
29 #include "rect.h" // for TBOX
30 #include "rejctmap.h" // for REJMAP
31 #include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI...
32 #include "werd.h" // for WERD, W_BOL, W_EOL
33 
34 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
35 
36 #include <cstdint> // for int32_t, int16_t
37 #include <functional> // for std::function
38 #include <set> // for std::pair
39 #include <vector> // for std::vector
40 
41 #include <sys/types.h> // for int8_t
42 
43 struct Pix;
44 
45 namespace tesseract {
46 
// Forward declarations of the types this header names before (or without)
// defining them.
47 class BLOCK;
48 class BLOCK_LIST;
49 class BLOCK_RES;
50 class ROW;
51 class ROW_RES;
52 class SEAM;
53 class WERD_RES;
54 
55 struct TWERD;
56 
57 class BoxWord;
58 class Tesseract;
59 struct FontInfo;
60 
61 /* Forward declarations */
// BLOCK_RES, ROW_RES and WERD_RES were already declared above; they are
// repeated here next to the list macros that take them as arguments.
// ELISTIZEH comes from elst.h and CLISTIZEH from clst.h (see the includes).
// NOTE(review): presumably these macros generate the *_LIST / *_IT types
// (e.g. BLOCK_RES_LIST, BLOCK_RES_IT) used further down - confirm in elst.h.
62 
63 class BLOCK_RES;
64 
65 ELISTIZEH(BLOCK_RES)
66 CLISTIZEH(BLOCK_RES)
67 class ROW_RES;
68 
69 ELISTIZEH(ROW_RES)
70 class WERD_RES;
71 
72 ELISTIZEH(WERD_RES)
73 
74 /*************************************************************************
75  * PAGE_RES - Page results
76  *************************************************************************/
// Top-level result object for one page: holds the list of per-block results
// together with page-wide counters and blamer/adaption debug data.
77 class PAGE_RES { // page result
78 public:
79  int32_t char_count; // zeroed by Init(); presumably chars on the page
80  int32_t rej_count; // zeroed by Init(); presumably rejected chars
81  BLOCK_RES_LIST block_res_list; // results for each block of the page
82  bool rejected; // cleared by Init()
83  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
84  // the next word. This pointer is not owned by PAGE_RES class.
85  WERD_CHOICE **prev_word_best_choice;
86  // Sums of blame reasons computed by the blamer.
87  std::vector<int> blame_reasons;
88  // Debug information about all the misadaptions on this page.
89  // Each BlamerBundle contains an index into this vector, so that words that
90  // caused misadaption could be marked. However, since words could be
91  // deleted/split/merged, the log is stored on the PAGE_RES level.
92  std::vector<std::string> misadaption_log;
93 
     // Resets the counters, the rejected flag and prev_word_best_choice, and
     // re-creates blame_reasons with IRR_NUM_REASONS value-initialized (zero)
     // entries. block_res_list and misadaption_log are left untouched.
94  inline void Init() {
95  char_count = 0;
96  rej_count = 0;
97  rejected = false;
98  prev_word_best_choice = nullptr;
99  blame_reasons.clear();
100  blame_reasons.resize(IRR_NUM_REASONS);
101  }
102 
     // Creates an empty page result in the state established by Init().
103  PAGE_RES() {
104  Init();
105  } // empty constructor
106 
     // Builds the page result from the real blocks in block_list (defined
     // out of line).
     // NOTE(review): presumably prev_word_best_choice_ptr is stored in
     // prev_word_best_choice and merge_similar_words is forwarded to the
     // BLOCK_RES constructor - confirm in the implementation file.
107  PAGE_RES(bool merge_similar_words,
108  BLOCK_LIST *block_list, // real blocks
109  WERD_CHOICE **prev_word_best_choice_ptr);
110 
111  ~PAGE_RES() = default;
112 };
113 
114 /*************************************************************************
115  * BLOCK_RES - Block results
116  *************************************************************************/
117 
// Recognition results for one BLOCK. It is an element of a BLOCK_RES_LIST
// (hence the ELIST_LINK base) and holds the results of the block's rows.
118 class BLOCK_RES : public ELIST_LINK {
119 public:
120  BLOCK *block; // real block
121  int32_t char_count; // chars in block
122  int32_t rej_count; // rejected chars
123  int16_t font_class; // NOTE(review): meaning not documented in this header
124  int16_t row_count;
125  float x_height;
126  bool font_assigned; // block already
127  // processed
128 
     // Results for the rows of this block.
129  ROW_RES_LIST row_res_list;
130 
     // NOTE(review): none of the scalar members above has a default member
     // initializer, so this defaulted constructor leaves them indeterminate
     // unless the object is value-initialized.
131  BLOCK_RES() = default;
132 
     // Builds the results for the_block (defined out of line).
133  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
134 
135  ~BLOCK_RES() = default;
136 };
137 
138 /*************************************************************************
139  * ROW_RES - Row results
140  *************************************************************************/
141 
// Recognition results for one ROW. It is an element of a ROW_RES_LIST
// (hence the ELIST_LINK base) and holds the results of the row's words.
142 class ROW_RES : public ELIST_LINK {
143 public:
144  ROW *row; // real row
145  int32_t char_count; // chars in row (comment used to say "block", presumably copied from BLOCK_RES - confirm)
146  int32_t rej_count; // rejected chars
147  int32_t whole_word_rej_count; // rejs in total rej wds
     // Results for the words of this row.
148  WERD_RES_LIST word_res_list;
149 
     // NOTE(review): the members above have no default member initializers,
     // so this defaulted constructor leaves row and the counters
     // indeterminate unless the object is value-initialized.
150  ROW_RES() = default;
151 
     // Builds the results for the_row (defined out of line).
152  ROW_RES(bool merge_similar_words, ROW *the_row); // real row
153 
154  ~ROW_RES() = default;
155 };
156 
157 /*************************************************************************
158  * WERD_RES - Word results
159  *************************************************************************/
161 
162 // WERD_RES is a collection of publicly accessible members that gathers
163 // information about a word result.
// Per-word recognition state and results. It is an element of a WERD_RES_LIST
// (hence the ELIST_LINK base), such as ROW_RES::word_res_list.
// NOTE(review): this listing has lost six source lines inside the class (the
// embedded line numbers skip 222, 278, 389, 406, 425 and 472). Each gap is
// flagged where it occurs; the class does not parse until they are restored.
164 class TESS_API WERD_RES : public ELIST_LINK {
165 public:
166  // Which word is which?
167  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
168  // the original image coordinate space, and the BLN space in which the
169  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
170  // and the x-middle of the word is at 0.
171  // In the rotated pixel space, coordinates correspond to the input image,
172  // but may be rotated about the origin by a multiple of 90 degrees,
173  // and may therefore be negative.
174  // In any case a rotation by denorm.block()->re_rotation() will take them
175  // back to the original image.
176  // The other differences between words all represent different stages of
177  // processing during recognition.
178 
179  // ---------------------------INPUT-------------------------------------
180 
181  // The word is the input C_BLOBs in the rotated pixel space.
182  // word is NOT owned by the WERD_RES unless combination is true.
183  // All the other word pointers ARE owned by the WERD_RES.
184  WERD *word = nullptr; // Input C_BLOB word.
185 
186  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
187 
188  // The bln_boxes contains the bounding boxes (only) of the input word, in the
189  // BLN space. The lengths of word and bln_boxes
190  // match as they are both before any chopping.
191  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
192  // if it doesn't.
193  tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes.
194  // The ROW that this word sits in. NOT owned by the WERD_RES.
195  ROW *blob_row = nullptr;
196  // The denorm provides the transformation to get back to the rotated image
197  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
198  // has its own denorm.
199  DENORM denorm; // For use on chopped_word.
200  // Unicharset used by the classifier output in best_choice and raw_choice.
201  const UNICHARSET *uch_set = nullptr; // For converting back to utf8.
202 
203  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
204  // ----Setup to a (different!) state expected by the various classifiers----
205  // TODO(rays) Tidy and make more consistent.
206 
207  // The chopped_word is also in BLN space, and represents the fully chopped
208  // character fragments that make up the word.
209  // The length of chopped_word matches length of seam_array + 1 (if set).
210  TWERD *chopped_word = nullptr; // BLN chopped fragments output.
211  // Vector of SEAM* holding chopping points matching chopped_word.
212  std::vector<SEAM *> seam_array;
213  // Widths of blobs in chopped_word.
214  std::vector<int> blob_widths;
215  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
216  // blob i and blob i+1.
217  std::vector<int> blob_gaps;
218  // Stores the lstm choices of every timestep
219  std::vector<std::vector<std::pair<const char *, float>>> timesteps;
220  // Stores the lstm choices of every timestep segmented by character
221  std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
     // NOTE(review): listing line 222, which must carry the member name for
     // the type on the previous line, is missing from this listing.
223  // Symbolchoices acquired during CTC
224  std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices;
225  // Stores if the timestep vector starts with a space
226  bool leading_space = false;
227  // Stores value when the word ends
228  int end = 0;
229  // Ratings matrix contains classifier choices for each classified combination
230  // of blobs. The dimension is the same as the number of blobs in chopped_word
231  // and the leading diagonal corresponds to classifier results of the blobs
232  // in chopped_word. The state_ members of best_choice, raw_choice and
233  // best_choices all correspond to this ratings matrix and allow extraction
234  // of the blob choices for any given WERD_CHOICE.
235  MATRIX *ratings = nullptr; // Owned pointer.
236  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
237  // will be output from Tesseract. Note that this is now a borrowed pointer
238  // and should NOT be deleted.
239  WERD_CHOICE *best_choice = nullptr; // Borrowed pointer.
240  // The best raw_choice found during segmentation search. Differs from the
241  // best_choice by being the best result according to just the character
242  // classifier, not taking any language model information into account.
243  // Unlike best_choice, the pointer IS owned by this WERD_RES.
244  WERD_CHOICE *raw_choice = nullptr; // Owned pointer.
245  // Alternative results found during chopping/segmentation search stages.
246  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
247  WERD_CHOICE_LIST best_choices;
248 
249  // Truth bounding boxes, text and incorrect choice reason.
250  BlamerBundle *blamer_bundle = nullptr;
251 
252  // --------------OUTPUT FROM RECOGNITION-------------------------------
253  // --------------Not all fields are necessarily set.-------------------
254  // ---best_choice, raw_choice *must* end up set, with a box_word-------
255  // ---In complete output, the number of blobs in rebuild_word matches---
256  // ---the number of boxes in box_word, the number of unichar_ids in---
257  // ---best_choice, the number of ints in best_state, and the number---
258  // ---of strings in correct_text--------------------------------------
259  // ---SetupFake Sets everything to appropriate values if the word is---
260  // ---known to be bad before recognition.------------------------------
261 
262  // The rebuild_word is also in BLN space, but represents the final best
263  // segmentation of the word. Its length is therefore the same as box_word.
264  TWERD *rebuild_word = nullptr; // BLN best segmented word.
265  // The box_word is in the original image coordinate space. It is the
266  // bounding boxes of the rebuild_word, after denormalization.
267  // The length of box_word matches rebuild_word, best_state (if set) and
268  // correct_text (if set), as well as best_choice and represents the
269  // number of classified units in the output.
270  tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes.
271  // The Tesseract that was used to recognize this word. Just a borrowed
272  // pointer. Note: Tesseract's class definition is in a higher-level library.
273  // We avoid introducing a cyclic dependency by not using the Tesseract
274  // within WERD_RES. We are just storing it to provide access to it
275  // for the top-level multi-language controller, and maybe for output of
276  // the recognized language.
277  // tesseract points to data owned elsewhere.
     // NOTE(review): listing line 278, presumably the declaration of the
     // `tesseract` member described above, is missing from this listing.
279  // The best_state stores the relationship between chopped_word and
280  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
281  // adjacent blobs in chopped_word. The seams in seam_array are hidden
282  // within a rebuild_word blob and revealed between them.
283  std::vector<int> best_state; // Number of blobs in each best blob.
284  // The correct_text is used during training and adaption to carry the
285  // text to the training system without the need for a unicharset. There
286  // is one entry in the vector for each blob in rebuild_word and box_word.
287  std::vector<std::string> correct_text;
288 
289  // Less-well documented members.
290  // TODO(rays) Add more documentation here.
291  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
292  REJMAP reject_map; // best_choice rejects
293  bool tess_failed = false;
294  /*
295  If tess_failed is true, one of the following tests failed when Tess
296  returned:
297  - The outword blob list was not the same length as the best_choice string;
298  - The best_choice string contained ALL blanks;
299  - The best_choice string was zero length
300 */
301  bool tess_accepted = false; // Tess thinks it's ok?
302  bool tess_would_adapt = false; // Tess would adapt?
303  bool done = false; // ready for output?
304  bool small_caps = false; // word appears to be small caps
305  bool odd_size = false; // word is bigger than line or leader dots.
306  // The fontinfos are pointers to data owned by the classifier.
307  const FontInfo *fontinfo = nullptr;
308  const FontInfo *fontinfo2 = nullptr;
309  int8_t fontinfo_id_count = 0; // number of votes
310  int8_t fontinfo_id2_count = 0; // number of votes
311  bool guessed_x_ht = true;
312  bool guessed_caps_ht = true;
     // NOTE(review): CRUNCH_MODE and CR_NONE are not declared anywhere in the
     // visible listing; listing line 160, just before this class, is missing.
313  CRUNCH_MODE unlv_crunch_mode = CR_NONE;
314  float x_height = 0.0f; // post match estimate
315  float caps_height = 0.0f; // post match estimate
316  float baseline_shift = 0.0f; // post match estimate.
317  // Certainty score for the spaces either side of this word (LSTM mode).
318  // MIN this value with the actual word certainty.
319  float space_certainty = 0.0f;
320 
321  /*
322  To deal with fuzzy spaces we need to be able to combine "words" to form
323  combinations when we suspect that the gap is a non-space. The (new) text
324  ord code generates separate words for EVERY fuzzy gap - flags in the word
325  indicate whether the gap is below the threshold (fuzzy kern) and is thus
326  NOT a real word break by default, or above the threshold (fuzzy space) and
327  this is a real word break by default.
328 
329  The WERD_RES list contains all these words PLUS "combination" words built
330  out of (copies of) the words split by fuzzy kerns. The separate parts have
331  their "part_of_combo" flag set true and should be IGNORED on a default
332  reading of the list.
333 
334  Combination words are FOLLOWED by the sequence of part_of_combo words
335  which they combine.
336 */
337  bool combination = false; // of two fuzzy gap wds
338  bool part_of_combo = false; // part of a combo
339  bool reject_spaces = false; // Reject spacing?
340 
341  WERD_RES() = default;
342 
     // Wraps the_word; every other member keeps its default initializer.
     // NOTE(review): not marked explicit, so a WERD* converts implicitly.
343  WERD_RES(WERD *the_word) {
344  word = the_word;
345  }
346  // Deep copies everything except the ratings MATRIX.
347  // To get that use deep_copy below.
348  WERD_RES(const WERD_RES &source) : ELIST_LINK(source) {
349  // combination is used in function Clear which is called from operator=.
350  combination = false;
351  *this = source; // see operator=
352  }
353 
354  ~WERD_RES();
355 
356  // Returns the UTF-8 string for the given blob index in the best_choice word,
357  // given that we know whether we are in a right-to-left reading context.
358  // This matters for mirrorable characters such as parentheses. We recognize
359  // characters purely based on their shape on the page, and by default produce
360  // the corresponding unicode for a left-to-right context.
     // Returns nullptr if best_choice is null, blob_index is out of range, or
     // the unichar id is not below uch_set->size().
     // NOTE(review): uch_set is dereferenced without a nullptr check.
361  const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const {
362  if (best_choice == nullptr || blob_index >= best_choice->length()) {
363  return nullptr;
364  }
365  UNICHAR_ID id = best_choice->unichar_id(blob_index);
366  if (static_cast<unsigned>(id) >= uch_set->size()) {
367  return nullptr;
368  }
369  UNICHAR_ID mirrored = uch_set->get_mirror(id);
370  if (in_rtl_context && mirrored > 0) {
371  id = mirrored;
372  }
373  return uch_set->id_to_unichar_ext(id);
374  }
375  // Returns the UTF-8 string for the given blob index in the raw_choice word.
     // Returns nullptr if blob_index is out of range or the unichar id is not
     // below uch_set->size().
     // NOTE(review): unlike BestUTF8, raw_choice is dereferenced without a
     // nullptr check (and so is uch_set).
376  const char *RawUTF8(unsigned blob_index) const {
377  if (blob_index >= raw_choice->length()) {
378  return nullptr;
379  }
380  UNICHAR_ID id = raw_choice->unichar_id(blob_index);
381  if (static_cast<unsigned>(id) >= uch_set->size()) {
382  return nullptr;
383  }
384  return uch_set->id_to_unichar(id);
385  }
386 
     // Returns the direction that uch_set reports for the unichar at
     // blob_index in best_choice.
     // NOTE(review): uch_set is dereferenced without a nullptr check.
387  UNICHARSET::Direction SymbolDirection(unsigned blob_index) const {
388  if (best_choice == nullptr || blob_index >= best_choice->length()) {
     // NOTE(review): listing line 389, the body of this guard (presumably an
     // early return), is missing from this listing.
390  }
391  return uch_set->get_direction(best_choice->unichar_id(blob_index));
392  }
393 
     // Returns true if any unichar of best_choice that is below
     // uch_set->size() has one of the directions tested below; returns false
     // when uch_set or best_choice is null or best_choice is empty.
394  bool AnyRtlCharsInWord() const {
395  if (uch_set == nullptr || best_choice == nullptr ||
396  best_choice->length() < 1) {
397  return false;
398  }
399  for (unsigned id = 0; id < best_choice->length(); id++) {
400  unsigned unichar_id = best_choice->unichar_id(id);
401  if (unichar_id >= uch_set->size()) {
402  continue; // Ignore illegal chars.
403  }
404  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
405  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
     // NOTE(review): listing line 406, the rest of this condition, is
     // missing from this listing.
407  return true;
408  }
409  }
410  return false;
411  }
412 
     // Left-to-right counterpart of AnyRtlCharsInWord, with the same guards
     // and the same scan over best_choice.
413  bool AnyLtrCharsInWord() const {
414  if (uch_set == nullptr || best_choice == nullptr ||
415  best_choice->length() < 1) {
416  return false;
417  }
418  for (unsigned id = 0; id < best_choice->length(); id++) {
419  unsigned unichar_id = best_choice->unichar_id(id);
420  if (unichar_id >= uch_set->size()) {
421  continue; // Ignore illegal chars.
422  }
423  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
424  if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
     // NOTE(review): listing line 425, the rest of this condition, is
     // missing from this listing.
426  return true;
427  }
428  }
429  return false;
430  }
431 
432  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
433  // that gave us the unichars in reading order (as opposed to strict left
434  // to right).
     // NOTE(review): best_choice is dereferenced without a nullptr check.
435  bool UnicharsInReadingOrder() const {
436  return best_choice->unichars_in_script_order();
437  }
438 
439  void Clear();
440  void ClearResults();
441  void ClearWordChoices();
442  void ClearRatings();
443 
444  // Deep copies everything except the ratings MATRIX.
445  // To get that use deep_copy below.
446  WERD_RES &operator=(const WERD_RES &source); // from this
447 
448  void CopySimpleFields(const WERD_RES &source);
449 
450  // Initializes a blank (default constructed) WERD_RES from one that has
451  // already been recognized.
452  // Use SetupFor*Recognition afterwards to complete the setup and make
453  // it ready for a retry recognition.
454  void InitForRetryRecognition(const WERD_RES &source);
455 
456  // Sets up the members used in recognition: bln_boxes, chopped_word,
457  // seam_array, denorm. Returns false if
458  // the word is empty and sets up fake results. If use_body_size is
459  // true and row->body_size is set, then body_size will be used for
460  // blob normalization instead of xheight + ascrise. This flag is for
461  // those languages that are using CJK pitch model and thus it has to
462  // be true if and only if tesseract->textord_use_cjk_fp_model is
463  // true.
464  // If allow_detailed_fx is true, the feature extractor will receive fine
465  // precision outline information, allowing smoother features and better
466  // features on low resolution images.
467  // The norm_mode sets the default mode for normalization in absence
468  // of any of the above flags. It should really be a tesseract::OcrEngineMode
469  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
470  // Returns false if the word is empty and sets up fake results.
471  bool SetupForRecognition(const UNICHARSET &unicharset_in,
     // NOTE(review): listing line 472, the parameters between unicharset_in
     // and norm_mode, is missing from this listing.
473  int norm_mode, const TBOX *norm_box,
474  bool numeric_mode, bool use_body_size,
475  bool allow_detailed_fx, ROW *row,
476  const BLOCK *block);
477 
478  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
479  // accumulators from a made chopped word. We presume the fields are already
480  // empty.
481  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
482 
483  // Sets up the members used in recognition for an empty recognition result:
484  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
485  void SetupFake(const UNICHARSET &uch);
486 
487  // Set the word as having the script of the input unicharset.
488  void SetupWordScript(const UNICHARSET &unicharset_in);
489 
490  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
491  void SetupBlamerBundle();
492 
493  // Computes the blob_widths and blob_gaps from the chopped_word.
494  void SetupBlobWidthsAndGaps();
495 
496  // Updates internal data to account for a new SEAM (chop) at the given
497  // blob_number. Fixes the ratings matrix and states in the choices, as well
498  // as the blob widths and gaps.
499  void InsertSeam(int blob_number, SEAM *seam);
500 
501  // Returns true if all the word choices except the first have adjust_factors
502  // worse than the given threshold.
503  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
504 
505  // Returns true if the current word is ambiguous (by number of answers or
506  // by dangerous ambigs.)
507  bool IsAmbiguous();
508 
509  // Returns true if the ratings matrix size matches the sum of each of the
510  // segmentation states.
511  bool StatesAllValid();
512 
513  // Prints a list of words found if debug is true or the word result matches
514  // the word_to_debug.
515  void DebugWordChoices(bool debug, const char *word_to_debug);
516 
517  // Prints the top choice along with the accepted/done flags.
518  void DebugTopChoice(const char *msg) const;
519 
520  // Removes from best_choices all choices which are not within a reasonable
521  // range of the best choice.
522  void FilterWordChoices(int debug_level);
523 
524  // Computes a set of distance thresholds used to control adaption.
525  // Compares the best choice for the current word to the best raw choice
526  // to determine which characters were classified incorrectly by the
527  // classifier. Then places a separate threshold into thresholds for each
528  // character in the word. If the classifier was correct, max_rating is placed
529  // into thresholds. If the classifier was incorrect, the mean match rating
530  // (error percentage) of the classifier's incorrect choice minus some margin
531  // is placed into thresholds. This can then be used by the caller to try to
532  // create a new template for the desired class that will classify the
533  // character with a rating better than the threshold value. The match rating
534  // placed into thresholds is never allowed to be below min_rating in order to
535  // prevent trying to make overly tight templates.
536  // min_rating limits how tight to make a template.
537  // max_rating limits how loose to make a template.
538  // rating_margin denotes the amount of margin to put in template.
539  void ComputeAdaptionThresholds(float certainty_scale, float min_rating,
540  float max_rating, float rating_margin,
541  float *thresholds);
542 
543  // Saves a copy of the word_choice if it has the best unadjusted rating.
544  // Returns true if the word_choice was the new best.
545  bool LogNewRawChoice(WERD_CHOICE *word_choice);
546  // Consumes word_choice by adding it to best_choices, (taking ownership) if
547  // the certainty for word_choice is some distance of the best choice in
548  // best_choices, or by deleting the word_choice and returning false.
549  // The best_choices list is kept in sorted order by rating. Duplicates are
550  // removed, and the list is kept no longer than max_num_choices in length.
551  // Returns true if the word_choice is still a valid pointer.
552  bool LogNewCookedChoice(int max_num_choices, bool debug,
553  WERD_CHOICE *word_choice);
554 
555  // Prints a brief list of all the best choices.
556  void PrintBestChoices() const;
557 
558  // Returns the sum of the widths of the blob between start_blob and last_blob
559  // inclusive.
560  int GetBlobsWidth(int start_blob, int last_blob) const;
561  // Returns the width of a gap between the specified blob and the next one.
562  int GetBlobsGap(unsigned blob_index) const;
563 
564  // Returns the BLOB_CHOICE corresponding to the given index in the
565  // best choice word taken from the appropriate cell in the ratings MATRIX.
566  // Borrowed pointer, so do not delete. May return nullptr if there is no
567  // BLOB_CHOICE matching the unichar_id at the given index.
568  BLOB_CHOICE *GetBlobChoice(unsigned index) const;
569 
570  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
571  // best choice word taken from the appropriate cell in the ratings MATRIX.
572  // Borrowed pointer, so do not delete.
573  BLOB_CHOICE_LIST *GetBlobChoices(int index) const;
574 
575  // Moves the results fields from word to this. This takes ownership of all
576  // the data, so src can be destructed.
577  // word1.ConsumeWordResults(word);
578  // delete word;
579  // is simpler and faster than:
580  // word1 = *word;
581  // delete word;
582  // as it doesn't need to copy and reallocate anything.
583  void ConsumeWordResults(WERD_RES *word);
584 
585  // Replace the best choice and rebuild box word.
586  // choice must be from the current best_choices list.
587  void ReplaceBestChoice(WERD_CHOICE *choice);
588 
589  // Builds the rebuild_word and sets the best_state from the chopped_word and
590  // the best_choice->state.
591  void RebuildBestState();
592 
593  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
594  // Also sets up the output box_word.
595  void CloneChoppedToRebuild();
596 
597  // Sets/replaces the box_word with one made from the rebuild_word.
598  void SetupBoxWord();
599 
600  // Sets up the script positions in the best_choice using the best_choice
601  // to get the unichars, and the unicharset to get the target positions.
602  void SetScriptPositions();
603  // Sets all the blobs in all the words (best choice and alternates) to be
604  // the given position. (When a sub/superscript is recognized as a separate
605  // word, it falls victim to the rule that a whole word cannot be sub or
606  // superscript, so this function overrides that problem.)
607  void SetAllScriptPositions(tesseract::ScriptPos position);
608 
609  // Classifies the word with some already-calculated BLOB_CHOICEs.
610  // The choices are an array of blob_count pointers to BLOB_CHOICE,
611  // providing a single classifier result for each blob.
612  // The BLOB_CHOICEs are consumed and the word takes ownership.
613  // The number of blobs in the box_word must match blob_count.
614  void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices);
615 
616  // Creates a WERD_CHOICE for the word using the top choices from the leading
617  // diagonal of the ratings matrix.
618  void FakeWordFromRatings(PermuterType permuter);
619 
620  // Copies the best_choice strings to the correct_text for adaption/training.
621  void BestChoiceToCorrectText();
622 
623  // Merges 2 adjacent blobs in the result if the permanent callback
624  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
625  // callback box_cb is nullptr or returns true, setting the merged blob
626  // result to the class returned from class_cb.
627  // Returns true if anything was merged.
628  bool ConditionalBlobMerge(
629  const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
630  const std::function<bool(const TBOX &, const TBOX &)> &box_cb);
631 
632  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
633  // all the data to account for the change.
634  void MergeAdjacentBlobs(unsigned index);
635 
636  // Callback helper for fix_quotes returns a double quote if both
637  // arguments are quote, otherwise INVALID_UNICHAR_ID.
638  UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
639  void fix_quotes();
640 
641  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
642  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
643  UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
644  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
645  // (assuming both on the same textline, are in order and a chopped em dash.)
646  bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2);
647  void fix_hyphens();
648 
649  // Callback helper for merge_tess_fails returns a space if both
650  // arguments are space, otherwise INVALID_UNICHAR_ID.
651  UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
652  void merge_tess_fails();
653 
654  // Returns a really deep copy of *src, including the ratings MATRIX.
     // The result is allocated with new and returned as a raw pointer.
655  static WERD_RES *deep_copy(const WERD_RES *src) {
656  auto *result = new WERD_RES(*src);
657  // That didn't copy the ratings, but we want a copy if there is one to
658  // begin with.
659  if (src->ratings != nullptr) {
660  result->ratings = src->ratings->DeepCopy();
661  }
662  return result;
663  }
664 
665  // Copy blobs from word_res onto this word (eliminating spaces between).
666  // Since this may be called bidirectionally OR both the BOL and EOL flags.
     // Dereferences word and word_res->word without nullptr checks.
667  void copy_on(WERD_RES *word_res) { // from this word
668  word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
669  word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
670  word->copy_on(word_res->word);
671  }
672 
673  // Returns true if the collection of count pieces, starting at start, are all
674  // natural connected components, ie there are no real chops involved.
675  bool PiecesAllNatural(int start, int count) const;
676 };
677 
678 /*************************************************************************
679  * PAGE_RES_IT - Page results iterator
680  *************************************************************************/
681 
// Body of PAGE_RES_IT: an iterator over the words of a PAGE_RES that also
// tracks the previous and next word together with their rows and blocks.
// NOTE(review): listing line 682, which must hold the class header, is
// missing from this listing, as are the headers of three inline methods
// (listing lines 710, 713 and 747). Each gap is flagged where it occurs.
683 public:
684  PAGE_RES *page_res; // page being iterated
685 
     // NOTE(review): the pointer members have no default member initializers,
     // so this defaulted constructor leaves page_res and the *_res pointers
     // indeterminate unless the object is value-initialized.
686  PAGE_RES_IT() = default;
687 
     // Binds the iterator to the_page_res and positions it via restart_page().
688  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
689  page_res = the_page_res;
690  restart_page(); // ready to scan
691  }
692 
693  // Do two PAGE_RES_ITs point at the same word?
694  // This is much cheaper than cmp().
695  bool operator==(const PAGE_RES_IT &other) const {
696  return word_res == other.word_res && row_res == other.row_res &&
697  block_res == other.block_res;
698  }
699 
700  bool operator!=(const PAGE_RES_IT &other) const {
701  return !(*this == other);
702  }
703 
704  // Given another PAGE_RES_IT to the same page,
705  // this before other: -1
706  // this equal to other: 0
707  // this later than other: 1
708  int cmp(const PAGE_RES_IT &other) const;
709 
     // NOTE(review): listing line 710, the header of this inline method, is
     // missing. The constructor above calls restart_page(), so this is
     // presumably its definition - confirm.
711  return start_page(false); // Skip empty blocks.
712  }
     // NOTE(review): listing line 713, the header of this inline method, is
     // missing from this listing.
714  return start_page(true); // Allow empty blocks.
715  }
716  WERD_RES *start_page(bool empty_ok);
717 
718  WERD_RES *restart_row();
719 
720  // ============ Methods that mutate the underlying structures ===========
721  // Note that these methods will potentially invalidate other PAGE_RES_ITs
722  // and are intended to be used only while a single PAGE_RES_IT is active.
723  // This problem needs to be taken into account if these mutation operators
724  // are ever provided to PageIterator or its subclasses.
725 
726  // Inserts the new_word and a corresponding WERD_RES before the current
727  // position. The simple fields of the WERD_RES are copied from clone_res and
728  // the resulting WERD_RES is returned for further setup with best_choice etc.
729  WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word);
730 
731  // Replaces the current WERD/WERD_RES with the given words. The given words
732  // contain fake blobs that indicate the position of the characters. These are
733  // replaced with real blobs from the current word as much as possible.
734  void ReplaceCurrentWord(PointerVector<WERD_RES> *words);
735 
736  // Deletes the current WERD_RES and its underlying WERD.
737  void DeleteCurrentWord();
738 
739  // Makes the current word a fuzzy space if not already fuzzy. Updates
740  // corresponding part of combo if required.
741  void MakeCurrentWordFuzzy();
742 
743  WERD_RES *forward() { // Get next word.
744  return internal_forward(false, false);
745  }
746  // Move forward, but allow empty blocks to show as single nullptr words.
     // NOTE(review): listing line 747, the header of this inline method, is
     // missing from this listing.
748  return internal_forward(false, true);
749  }
750 
751  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
752  WERD_RES *forward_block(); // get first word in next non-empty block
753 
     // Plain accessors for the cached previous/current/next positions.
     // They return borrowed pointers and do not move the iterator.
754  WERD_RES *prev_word() const { // previous word
755  return prev_word_res;
756  }
757  ROW_RES *prev_row() const { // row of prev word
758  return prev_row_res;
759  }
760  BLOCK_RES *prev_block() const { // block of prev word
761  return prev_block_res;
762  }
763  WERD_RES *word() const { // current word
764  return word_res;
765  }
766  ROW_RES *row() const { // row of current word
767  return row_res;
768  }
769  BLOCK_RES *block() const { // block of cur. word
770  return block_res;
771  }
772  WERD_RES *next_word() const { // next word
773  return next_word_res;
774  }
775  ROW_RES *next_row() const { // row of next word
776  return next_row_res;
777  }
778  BLOCK_RES *next_block() const { // block of next word
779  return next_block_res;
780  }
781  void rej_stat_word(); // for page/block/row
782  void ResetWordIterator();
783 
784 private:
     // Shared implementation behind forward() and the other inline movers;
     // they differ only in the empty_ok argument they pass.
785  WERD_RES *internal_forward(bool new_block, bool empty_ok);
786 
787  WERD_RES *prev_word_res; // previous word
788  ROW_RES *prev_row_res; // row of prev word
789  BLOCK_RES *prev_block_res; // block of prev word
790 
791  WERD_RES *word_res; // current word
792  ROW_RES *row_res; // row of current word
793  BLOCK_RES *block_res; // block of cur. word
794 
795  WERD_RES *next_word_res; // next word
796  ROW_RES *next_row_res; // row of next word
797  BLOCK_RES *next_block_res; // block of next word
798 
799  BLOCK_RES_IT block_res_it; // iterators
800  ROW_RES_IT row_res_it;
801  WERD_RES_IT word_res_it;
802  // Iterators used to get the state of word_res_it for the current word.
803  // Since word_res_it is 2 words further on, this is otherwise hard to do.
804  WERD_RES_IT wr_it_of_current_word;
805  WERD_RES_IT wr_it_of_next_word;
806 };
807 
808 } // namespace tesseract
809 
810 #endif
#define CLISTIZEH(CLASSNAME)
Definition: clst.h:705
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
@ IRR_NUM_REASONS
Definition: blamer.h:103
int UNICHAR_ID
Definition: unichar.h:36
PermuterType
Definition: ratngs.h:231
MATRIX * DeepCopy() const
Definition: matrix.cpp:97
int32_t rej_count
Definition: pageres.h:80
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
int32_t char_count
Definition: pageres.h:79
std::vector< std::string > misadaption_log
Definition: pageres.h:92
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:85
std::vector< int > blame_reasons
Definition: pageres.h:87
ROW_RES_LIST row_res_list
Definition: pageres.h:129
int16_t font_class
Definition: pageres.h:123
int32_t char_count
Definition: pageres.h:121
WERD_RES_LIST word_res_list
Definition: pageres.h:148
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
void copy_on(WERD_RES *word_res)
Definition: pageres.h:667
bool AnyRtlCharsInWord() const
Definition: pageres.h:394
std::vector< std::string > correct_text
Definition: pageres.h:287
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:219
WERD_RES(WERD *the_word)
Definition: pageres.h:343
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
Definition: pageres.h:361
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247
bool UnicharsInReadingOrder() const
Definition: pageres.h:435
MATRIX * ratings
Definition: pageres.h:235
std::vector< int > best_state
Definition: pageres.h:283
WERD_RES(const WERD_RES &source)
Definition: pageres.h:348
const char * RawUTF8(unsigned blob_index) const
Definition: pageres.h:376
std::vector< int > blob_widths
Definition: pageres.h:214
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:655
std::vector< int > blob_gaps
Definition: pageres.h:217
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:224
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const
Definition: pageres.h:387
std::vector< SEAM * > seam_array
Definition: pageres.h:212
bool AnyLtrCharsInWord() const
Definition: pageres.h:413
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:222
WERD_RES * next_word() const
Definition: pageres.h:772
PAGE_RES * page_res
Definition: pageres.h:684
BLOCK_RES * next_block() const
Definition: pageres.h:778
WERD_RES * restart_page()
Definition: pageres.h:710
WERD_RES * prev_word() const
Definition: pageres.h:754
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.h:695
WERD_RES * forward_with_empties()
Definition: pageres.h:747
BLOCK_RES * prev_block() const
Definition: pageres.h:760
bool operator!=(const PAGE_RES_IT &other) const
Definition: pageres.h:700
WERD_RES * restart_page_with_empties()
Definition: pageres.h:713
PAGE_RES_IT(PAGE_RES *the_page_res)
Definition: pageres.h:688
ROW_RES * prev_row() const
Definition: pageres.h:757
WERD_RES * forward()
Definition: pageres.h:743
ROW_RES * row() const
Definition: pageres.h:766
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
ROW_RES * next_row() const
Definition: pageres.h:775
bool unichars_in_script_order() const
Definition: ratngs.h:509
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
unsigned length() const
Definition: ratngs.h:283
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
void copy_on(WERD *other)
Definition: werd.cpp:230
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
size_t size() const
Definition: unicharset.h:355
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287
#define TESS_API
Definition: export.h:34