tesseract  5.0.0
wordrec.h
Go to the documentation of this file.
1 // File: wordrec.h
3 // Description: wordrec class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_WORDREC_WORDREC_H_
20 #define TESSERACT_WORDREC_WORDREC_H_
21 
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE
24 #endif
25 
26 #ifdef DISABLED_LEGACY_ENGINE
27 
28 # include <cstdint> // for int16_t, int32_t
29 # include "classify.h" // for Classify
30 # include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
31 # include "ratngs.h" // for WERD_CHOICE
32 
33 namespace tesseract {
34 class TessdataManager;
35 }
36 
37 namespace tesseract {
38 
39 /* ccmain/tstruct.cpp */
40 
41 class TESS_API Wordrec : public Classify {
42 public:
43  // config parameters
44 
45  BOOL_VAR_H(wordrec_debug_blamer);
46  BOOL_VAR_H(wordrec_run_blamer);
47 
48  // methods
49  Wordrec();
50  virtual ~Wordrec() = default;
51 
52  // tface.cpp
53  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
54  TessdataManager *init_dict);
55  void program_editdown(int32_t elasped_time);
56  int end_recog();
57  int dict_word(const WERD_CHOICE &word);
58 
59  // Member variables
60  WERD_CHOICE *prev_word_best_choice_;
61 };
62 
63 } // namespace tesseract
64 
65 #else // DISABLED_LEGACY_ENGINE not defined
66 
67 # include <memory>
68 # include "associate.h"
69 # include "chop.h" // for PointHeap, MAX_NUM_POINTS
70 # include "classify.h" // for Classify
71 # include "dict.h"
72 # include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
73 # include "findseam.h" // for SeamQueue, SeamPile
74 # include "language_model.h"
75 # include "matrix.h"
76 # include "oldlist.h" // for LIST
77 # include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
78 # include "points.h" // for ICOORD
79 # include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...
80 # include "seam.h" // for SEAM (ptr only), PRIORITY
81 # include "stopper.h" // for DANGERR
82 
83 # include <cstdint> // for int16_t, int32_t
84 
85 namespace tesseract {
86 
87 class EDGEPT_CLIST;
88 class MATRIX;
89 class TBOX;
90 class UNICHARSET;
91 class WERD_RES;
92 
93 class LMPainPoints;
94 class TessdataManager;
95 struct BestChoiceBundle;
96 
97 struct BlamerBundle;
98 struct EDGEPT;
99 struct MATRIX_COORD;
100 struct SPLIT;
101 struct TBLOB;
102 struct TESSLINE;
103 struct TWERD;
104 
105 // A class for storing which nodes are to be processed by the segmentation
106 // search. There is a single SegSearchPending for each column in the ratings
107 // matrix, and it indicates whether the segsearch should combine all
108 // BLOB_CHOICES in the column, or just the given row with the parents
109 // corresponding to *this SegSearchPending, and whether only updated parent
110 // ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
112 public:
114  : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {}
115 
116  // Marks the whole column as just classified. Used to start a search on
117  // a newly initialized ratings matrix.
119  column_classified_ = true;
120  }
121  // Marks the matrix entry at the given row as just classified.
122  // Used after classifying a new matrix cell.
123  // Additional to, not overriding a previous RevisitWholeColumn.
124  void SetBlobClassified(int row) {
125  classified_row_ = row;
126  }
127  // Marks the whole column as needing work, but not just classified.
128  // Used when the parent vse list is updated.
129  // Additional to, not overriding a previous SetBlobClassified.
131  revisit_whole_column_ = true;
132  }
133 
134  // Clears *this to indicate no work to do.
135  void Clear() {
136  classified_row_ = -1;
137  revisit_whole_column_ = false;
138  column_classified_ = false;
139  }
140 
141  // Returns true if there are updates to do in the column that *this
142  // represents.
143  bool WorkToDo() const {
144  return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
145  }
146  // Returns true if the given row was just classified.
147  bool IsRowJustClassified(int row) const {
148  return row == classified_row_ || column_classified_;
149  }
150  // Returns the single row to process if there is only one, otherwise -1.
151  int SingleRow() const {
152  return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;
153  }
154 
155 private:
156  // If non-negative, indicates the single row in the ratings matrix that has
157  // just been classified, and so should be combined with all the parents in the
158  // column that this SegSearchPending represents.
159  // Operates independently of revisit_whole_column.
160  int classified_row_;
161  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
162  // be processed, but classified_row can indicate a row that is newly
163  // classified. Overridden if column_classified is true.
164  bool revisit_whole_column_;
165  // If column_classified is true, parent vses are processed with all rows
166  // regardless of whether they are just updated, overriding
167  // revisit_whole_column and classified_row.
168  bool column_classified_;
169 };
170 
171 /* ccmain/tstruct.cpp *********************************************************/
172 class FRAGMENT : public ELIST_LINK {
173 public:
174  FRAGMENT() { // constructor
175  }
176  FRAGMENT(EDGEPT *head_pt, // start
177  EDGEPT *tail_pt); // end
178 
179  ICOORD head; // coords of start
180  ICOORD tail; // coords of end
181  EDGEPT *headpt; // start point
182  EDGEPT *tailpt; // end point
183 };
185 
186 class TESS_API Wordrec : public Classify {
187 public:
188  // config parameters *******************************************************
189  BOOL_VAR_H(merge_fragments_in_matrix);
190  BOOL_VAR_H(wordrec_enable_assoc);
191  BOOL_VAR_H(force_word_assoc);
192  INT_VAR_H(repair_unchopped_blobs);
193  double_VAR_H(tessedit_certainty_threshold);
194  INT_VAR_H(chop_debug);
195  BOOL_VAR_H(chop_enable);
196  BOOL_VAR_H(chop_vertical_creep);
197  INT_VAR_H(chop_split_length);
198  INT_VAR_H(chop_same_distance);
199  INT_VAR_H(chop_min_outline_points);
200  INT_VAR_H(chop_seam_pile_size);
201  BOOL_VAR_H(chop_new_seam_pile);
202  INT_VAR_H(chop_inside_angle);
203  INT_VAR_H(chop_min_outline_area);
204  double_VAR_H(chop_split_dist_knob);
205  double_VAR_H(chop_overlap_knob);
206  double_VAR_H(chop_center_knob);
207  INT_VAR_H(chop_centered_maxwidth);
208  double_VAR_H(chop_sharpness_knob);
209  double_VAR_H(chop_width_change_knob);
210  double_VAR_H(chop_ok_split);
211  double_VAR_H(chop_good_split);
212  INT_VAR_H(chop_x_y_weight);
213  BOOL_VAR_H(assume_fixed_pitch_char_segment);
214  INT_VAR_H(wordrec_debug_level);
215  INT_VAR_H(wordrec_max_join_chunks);
216  BOOL_VAR_H(wordrec_skip_no_truth_words);
217  BOOL_VAR_H(wordrec_debug_blamer);
218  BOOL_VAR_H(wordrec_run_blamer);
219  INT_VAR_H(segsearch_debug_level);
220  INT_VAR_H(segsearch_max_pain_points);
221  INT_VAR_H(segsearch_max_futile_classifications);
222  double_VAR_H(segsearch_max_char_wh_ratio);
223  BOOL_VAR_H(save_alt_choices);
224 
225  // methods from wordrec/*.cpp ***********************************************
226  Wordrec();
227  ~Wordrec() override = default;
228 
229  // Fills word->alt_choices with alternative paths found during
230  // chopping/segmentation search that are kept in best_choices.
231  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
232 
233  // Fills character choice lattice in the given BlamerBundle
234  // using the given ratings matrix and best choice list.
235  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
236  const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
237 
238  // Calls fill_lattice_ member function
239  // (assumes that fill_lattice_ is not nullptr).
240  void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
241  const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
242  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
243  }
244 
245  // tface.cpp
246  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
247  TessdataManager *init_dict);
248  void cc_recog(WERD_RES *word);
249  void program_editdown(int32_t elasped_time);
250  void set_pass1();
251  void set_pass2();
252  int end_recog();
253  BLOB_CHOICE_LIST *call_matcher(TBLOB *blob);
254  int dict_word(const WERD_CHOICE &word);
255  // wordclass.cpp
256  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,
257  BlamerBundle *blamer_bundle);
258 
259  // segsearch.cpp
260  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
261  // Each entry in the matrix represents the classification choice
262  // for a chunk, i.e. an entry in row 2, column 1 represents the list
263  // of ratings for the chunks 1 and 2 classified as a single blob.
264  // The entries on the diagonal of the matrix are classifier choice lists
265  // for a single chunk from the maximal segmentation.
266  //
267  // The ratings matrix given to SegSearch represents the segmentation
268  // graph / trellis for the current word. The nodes in the graph are the
269  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
270  // matrix. The children of each node (nodes connected by outgoing links)
271  // are the entries in the column that is equal to node's row+1. The parents
272  // (nodes connected by the incoming links) are the entries in the row that
273  // is equal to the node's column-1. Here is an example ratings matrix:
274  //
275  // 0 1 2 3 4
276  // -------------------------
277  // 0| c,( |
278  // 1| d l,1 |
279  // 2| o |
280  // 3| c,( |
281  // 4| g,y l,1 |
282  // -------------------------
283  //
284  // In the example above node "o" has children (outgoing connection to nodes)
285  // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
286  //
287  // The objective of the search is to find the least cost path, where the cost
288  // is determined by the language model components and the properties of the
289  // cut between the blobs on the path. SegSearch starts by populating the
290  // matrix with the all the entries that were classified by the chopper and
291  // finding the initial best path. Based on the classifier ratings, language
292  // model scores and the properties of each cut, a list of "pain points" is
293  // constructed - those are the points on the path where the choices do not
294  // look consistent with the neighboring choices, the cuts look particularly
295  // problematic, or the certainties of the blobs are low. The most troublesome
296  // "pain point" is picked from the list and the new entry in the ratings
297  // matrix corresponding to this "pain point" is filled in. Then the language
298  // model state is updated to reflect the new classification and the new
299  // "pain points" are added to the list and the next most troublesome
300  // "pain point" is determined. This continues until either the word choice
301  // composed from the best paths in the segmentation graph is "good enough"
302  // (e.g. above a certain certainty threshold, is an unambiguous dictionary
303  // word, etc) or there are no more "pain points" to explore.
304  //
305  // If associate_blobs is set to false no new classifications will be done
306  // to combine blobs. Segmentation search will run only one "iteration"
307  // on the classifications already recorded in chunks_record.ratings.
308  //
309  // Note: this function assumes that word_res, best_choice_bundle arguments
310  // are not nullptr.
311  void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
312  BlamerBundle *blamer_bundle);
313 
314  // Setup and run just the initial segsearch on an established matrix,
315  // without doing any additional chopping or joining.
316  // (Internal factored version that can be used as part of the main SegSearch.)
317  void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
318  std::vector<SegSearchPending> *pending,
319  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
320 
321  // chop.cpp
322  PRIORITY point_priority(EDGEPT *point);
323  void add_point_to_list(PointHeap *point_heap, EDGEPT *point);
324  // Returns true if the edgept supplied as input is an inside angle. This
325  // is determined by the angular change of the vectors from point to point.
326  bool is_inside_angle(EDGEPT *pt);
327  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
328  EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist);
329  void prioritize_points(TESSLINE *outline, PointHeap *points);
330  void new_min_point(EDGEPT *local_min, PointHeap *points);
331  void new_max_point(EDGEPT *local_max, PointHeap *points);
332  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point,
333  EDGEPT_CLIST *new_points);
334 
335  // chopper.cpp
336  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
337  const std::vector<SEAM *> &seams);
338  SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
339  const std::vector<SEAM *> &seams);
340  SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
341  unsigned *blob_number);
342  SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
343  bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
344  unsigned *blob_number);
345  SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
346  const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
347  unsigned *blob_number);
348  void chop_word_main(WERD_RES *word);
349  void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
350  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
351  LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
352  int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
353  bool split_next_to_fragment);
354  int select_blob_to_split_from_fixpt(DANGERR *fixpt);
355 
356  // findseam.cpp
357  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams);
358  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,
359  SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile);
360  void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue);
361  SEAM *pick_good_seam(TBLOB *blob);
362  void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue,
363  SeamPile *seam_pile, SEAM **seam, TBLOB *blob);
364  void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,
365  EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile,
366  SEAM **seam, TBLOB *blob);
367 
368  // gradechop.cpp
369  PRIORITY grade_split_length(SPLIT *split);
370  PRIORITY grade_sharpness(SPLIT *split);
371 
372  // outlines.cpp
373  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);
374 
375  // pieces.cpp
376  virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
377  int16_t end, const char *description, TWERD *word,
378  BlamerBundle *blamer_bundle);
379 
380  // Member variables.
381 
382  std::unique_ptr<LanguageModel> language_model_;
384  // Stores the best choice for the previous word in the paragraph.
385  // This variable is modified by PAGE_RES_IT when iterating over
386  // words to OCR on the page.
388 
389  // Function used to fill char choice lattices.
390  void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
391  const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
392 
393 protected:
394  inline bool SegSearchDone(int num_futile_classifications) {
395  return (language_model_->AcceptableChoiceFound() ||
396  num_futile_classifications >= segsearch_max_futile_classifications);
397  }
398 
399  // Updates the language model state recorded for the child entries specified
400  // in pending[starting_col]. Enqueues the children of the updated entries
401  // into pending and proceeds to update (and remove from pending) all the
402  // remaining entries in pending[col] (col >= starting_col). Upon termination
403  // of this function all the pending[col] lists will be empty.
404  //
405  // The arguments:
406  //
407  // starting_col: index of the column in chunks_record->ratings from
408  // which the update should be started
409  //
410  // pending: list of entries listing chunks_record->ratings entries
411  // that should be updated
412  //
413  // pain_points: priority heap listing the pain points generated by
414  // the language model
415  //
416  // temp_pain_points: temporary storage for tentative pain points generated
417  // by the language model after a single call to LanguageModel::UpdateState()
418  // (the argument is passed in rather than created before each
419  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
420  //
421  // best_choice_bundle: a collection of variables that should be updated
422  // if a new best choice is found
423  //
424  void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
425  std::vector<SegSearchPending> *pending, WERD_RES *word_res,
426  LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
427  BlamerBundle *blamer_bundle);
428 
429  // Process the given pain point: classify the corresponding blob, enqueue
430  // new pain points to join the newly classified blob with its neighbors.
431  void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
432  const char *pain_point_type,
433  std::vector<SegSearchPending> *pending, WERD_RES *word_res,
434  LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
435  // Resets enough of the results so that the Viterbi search is re-run.
436  // Needed when the n-gram model is enabled, as the multi-length comparison
437  // implementation will re-value existing paths to worse values.
438  void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
439  std::vector<SegSearchPending> &pending);
440 
441  // Add pain points for classifying blobs on the correct segmentation path
442  // (so that we can evaluate correct segmentation path and discover the reason
443  // for incorrect result).
444  void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
445  BlamerBundle *blamer_bundle, std::string &blamer_debug);
446 };
447 
448 } // namespace tesseract
449 
450 #endif // DISABLED_LEGACY_ENGINE
451 
452 #endif // TESSERACT_WORDREC_WORDREC_H_
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
#define MAX_NUM_POINTS
Definition: chop.h:28
@ TBOX
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
float PRIORITY
Definition: seam.h:31
BOOL_VAR_H(wordrec_display_splits)
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
integer coordinate
Definition: points.h:36
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:226
void SetBlobClassified(int row)
Definition: wordrec.h:124
bool IsRowJustClassified(int row) const
Definition: wordrec.h:147
EDGEPT * headpt
Definition: wordrec.h:181
EDGEPT * tailpt
Definition: wordrec.h:182
FRAGMENT(EDGEPT *head_pt, EDGEPT *tail_pt)
BOOL_VAR_H(assume_fixed_pitch_char_segment)
INT_VAR_H(segsearch_debug_level)
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387
INT_VAR_H(wordrec_debug_level)
double_VAR_H(chop_width_change_knob)
BOOL_VAR_H(merge_fragments_in_matrix)
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:240
INT_VAR_H(segsearch_max_futile_classifications)
INT_VAR_H(chop_debug)
INT_VAR_H(chop_inside_angle)
INT_VAR_H(chop_same_distance)
BOOL_VAR_H(wordrec_skip_no_truth_words)
INT_VAR_H(chop_seam_pile_size)
void SaveAltChoices(const LIST &best_choices, WERD_RES *word)
BOOL_VAR_H(wordrec_debug_blamer)
double_VAR_H(chop_center_knob)
INT_VAR_H(chop_split_length)
void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
INT_VAR_H(repair_unchopped_blobs)
INT_VAR_H(chop_x_y_weight)
PRIORITY pass2_ok_split
Definition: wordrec.h:383
BOOL_VAR_H(chop_enable)
BOOL_VAR_H(wordrec_run_blamer)
BOOL_VAR_H(force_word_assoc)
double_VAR_H(chop_ok_split)
double_VAR_H(chop_sharpness_knob)
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:394
double_VAR_H(segsearch_max_char_wh_ratio)
INT_VAR_H(chop_min_outline_area)
double_VAR_H(chop_good_split)
INT_VAR_H(segsearch_max_pain_points)
BOOL_VAR_H(chop_vertical_creep)
BOOL_VAR_H(wordrec_enable_assoc)
double_VAR_H(tessedit_certainty_threshold)
~Wordrec() override=default
BOOL_VAR_H(save_alt_choices)
BOOL_VAR_H(chop_new_seam_pile)
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382
INT_VAR_H(chop_centered_maxwidth)
double_VAR_H(chop_overlap_knob)
double_VAR_H(chop_split_dist_knob)
INT_VAR_H(wordrec_max_join_chunks)
INT_VAR_H(chop_min_outline_points)
#define TESS_API
Definition: export.h:34