tesseract  5.0.0
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const std::string &lang, TessdataManager *data_file)
 
void LoadLSTM (const std::string &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (not CJ, i.e. Chinese/Japanese, or T, i.e. Thai). More...
 
 STRING_VAR_H (user_words_file)
 
 STRING_VAR_H (user_words_suffix)
 
 STRING_VAR_H (user_patterns_file)
 
 STRING_VAR_H (user_patterns_suffix)
 
 BOOL_VAR_H (load_system_dawg)
 
 BOOL_VAR_H (load_freq_dawg)
 
 BOOL_VAR_H (load_unambig_dawg)
 
 BOOL_VAR_H (load_punc_dawg)
 
 BOOL_VAR_H (load_number_dawg)
 
 BOOL_VAR_H (load_bigram_dawg)
 
 double_VAR_H (xheight_penalty_subscripts)
 
 double_VAR_H (xheight_penalty_inconsistent)
 
 double_VAR_H (segment_penalty_dict_frequent_word)
 
 double_VAR_H (segment_penalty_dict_case_ok)
 
 double_VAR_H (segment_penalty_dict_case_bad)
 
 double_VAR_H (segment_penalty_dict_nonword)
 
 double_VAR_H (segment_penalty_garbage)
 
 STRING_VAR_H (output_ambig_words_file)
 
 INT_VAR_H (dawg_debug_level)
 
 INT_VAR_H (hyphen_debug_level)
 
 BOOL_VAR_H (use_only_first_uft8_step)
 
 double_VAR_H (certainty_scale)
 
 double_VAR_H (stopper_nondict_certainty_base)
 
 double_VAR_H (stopper_phase2_certainty_rejection_offset)
 
 INT_VAR_H (stopper_smallword_size)
 
 double_VAR_H (stopper_certainty_per_char)
 
 double_VAR_H (stopper_allowable_character_badness)
 
 INT_VAR_H (stopper_debug_level)
 
 BOOL_VAR_H (stopper_no_acceptable_choices)
 
 INT_VAR_H (tessedit_truncate_wordchoice_log)
 
 STRING_VAR_H (word_to_debug)
 
 BOOL_VAR_H (segment_nonalphabetic_script)
 
 BOOL_VAR_H (save_doc_words)
 
 double_VAR_H (doc_dict_pending_threshold)
 
 double_VAR_H (doc_dict_certainty_threshold)
 
 INT_VAR_H (max_permuter_attempts)
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word, keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr, contains information about the immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 

Detailed Description

Definition at line 94 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 29 of file dict.cpp.

32  , ccutil_(ccutil)
33  , wildcard_unichar_id_(INVALID_UNICHAR_ID)
34  , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
35  , question_unichar_id_(INVALID_UNICHAR_ID)
36  , slash_unichar_id_(INVALID_UNICHAR_ID)
37  , hyphen_unichar_id_(INVALID_UNICHAR_ID)
38  , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
39  getCCUtil()->params())
40  , STRING_INIT_MEMBER(user_words_suffix, "",
41  "A suffix of user-provided words located in tessdata.",
42  getCCUtil()->params())
43  , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
44  getCCUtil()->params())
45  , STRING_INIT_MEMBER(user_patterns_suffix, "",
46  "A suffix of user-provided patterns located in "
47  "tessdata.",
48  getCCUtil()->params())
49  , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
50  , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
51  , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
52  getCCUtil()->params())
53  , BOOL_INIT_MEMBER(load_punc_dawg, true,
54  "Load dawg with punctuation"
55  " patterns.",
56  getCCUtil()->params())
57  , BOOL_INIT_MEMBER(load_number_dawg, true,
58  "Load dawg with number"
59  " patterns.",
60  getCCUtil()->params())
61  , BOOL_INIT_MEMBER(load_bigram_dawg, true,
62  "Load dawg with special word "
63  "bigrams.",
64  getCCUtil()->params())
65  , double_MEMBER(xheight_penalty_subscripts, 0.125,
66  "Score penalty (0.1 = 10%) added if there are subscripts "
67  "or superscripts in a word, but it is otherwise OK.",
68  getCCUtil()->params())
69  , double_MEMBER(xheight_penalty_inconsistent, 0.25,
70  "Score penalty (0.1 = 10%) added if an xheight is "
71  "inconsistent.",
72  getCCUtil()->params())
73  , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
74  "Score multiplier for word matches which have good case and"
75  " are frequent in the given language (lower is better).",
76  getCCUtil()->params())
77  , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
78  "Score multiplier for word matches that have good case "
79  "(lower is better).",
80  getCCUtil()->params())
81  , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
82  "Default score multiplier for word matches, which may have "
83  "case issues (lower is better).",
84  getCCUtil()->params())
85  , double_MEMBER(segment_penalty_dict_nonword, 1.25,
86  "Score multiplier for glyph fragment segmentations which "
87  "do not match a dictionary word (lower is better).",
88  getCCUtil()->params())
89  , double_MEMBER(segment_penalty_garbage, 1.50,
90  "Score multiplier for poorly cased strings that are not in"
91  " the dictionary and generally look like garbage (lower is"
92  " better).",
93  getCCUtil()->params())
94  , STRING_MEMBER(output_ambig_words_file, "",
95  "Output file for ambiguities found in the dictionary", getCCUtil()->params())
96  , INT_MEMBER(dawg_debug_level, 0,
97  "Set to 1 for general debug info"
98  ", to 2 for more details, to 3 to see all the debug messages",
99  getCCUtil()->params())
100  , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
101  , BOOL_MEMBER(use_only_first_uft8_step, false,
102  "Use only the first UTF8 step of the given string"
103  " when computing log probabilities.",
104  getCCUtil()->params())
105  , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
106  , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
107  getCCUtil()->params())
108  , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
109  getCCUtil()->params())
110  , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
111  getCCUtil()->params())
112  , double_MEMBER(stopper_certainty_per_char, -0.50,
113  "Certainty to add"
114  " for each dict char above small word size.",
115  getCCUtil()->params())
116  , double_MEMBER(stopper_allowable_character_badness, 3.0,
117  "Max certaintly variation allowed in a word (in sigma)", getCCUtil()->params())
118  , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
119  , BOOL_MEMBER(stopper_no_acceptable_choices, false,
120  "Make AcceptableChoice() always return false. Useful"
121  " when there is a need to explore all segmentations",
122  getCCUtil()->params())
123  , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
124  getCCUtil()->params())
125  , STRING_MEMBER(word_to_debug, "",
126  "Word for which stopper debug"
127  " information should be printed to stdout",
128  getCCUtil()->params())
129  , BOOL_MEMBER(segment_nonalphabetic_script, false,
130  "Don't use any alphabetic-specific tricks."
131  " Set to true in the traineddata config file for"
132  " scripts that are cursive or inherently fixed-pitch",
133  getCCUtil()->params())
134  , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
135  , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
136  getCCUtil()->params())
137  , double_MEMBER(doc_dict_certainty_threshold, -2.25,
138  "Worst certainty for words that can be inserted into the"
139  " document dictionary",
140  getCCUtil()->params())
141  , INT_MEMBER(max_permuter_attempts, 10000,
142  "Maximum number of different"
143  " character choices to consider during permutation."
144  " This limit is especially useful when user patterns"
145  " are specified, since overly generic patterns can result in"
146  " dawg search exploring an overly large number of options.",
147  getCCUtil()->params()) {
148  reject_offset_ = 0.0;
149  go_deeper_fxn_ = nullptr;
150  hyphen_word_ = nullptr;
151  last_word_on_line_ = false;
152  document_words_ = nullptr;
153  dawg_cache_ = nullptr;
154  dawg_cache_is_ours_ = false;
155  pending_words_ = nullptr;
156  bigram_dawg_ = nullptr;
157  freq_dawg_ = nullptr;
158  punc_dawg_ = nullptr;
159  unambig_dawg_ = nullptr;
160  wordseg_rating_adjust_factor_ = -1.0f;
161  output_ambig_words_file_ = nullptr;
162 }
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:368
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:380
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:378
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:374
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:372
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:370
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:210
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:364
const CCUtil * getCCUtil() const
Definition: dict.h:98
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:354
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:406

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 164 of file dict.cpp.

164  {
165  End();
166  delete hyphen_word_;
167  if (output_ambig_words_file_ != nullptr) {
168  fclose(output_ambig_words_file_);
169  }
170 }
void End()
Definition: dict.cpp:379

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 66 of file context.cpp.

66  {
67  if (word.length() < kMinAbsoluteGarbageWordLength) {
68  return false;
69  }
70  int num_alphanum = 0;
71  for (unsigned x = 0; x < word.length(); ++x) {
72  num_alphanum +=
73  (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74  }
75  return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76  kMinAbsoluteGarbageAlphanumFrac);
77 }

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 42 of file stopper.cpp.

43  {
44  float CertaintyThreshold = stopper_nondict_certainty_base;
45  int WordSize;
46 
47  if (stopper_no_acceptable_choices) {
48  return false;
49  }
50 
51  if (best_choice.empty()) {
52  return false;
53  }
54 
55  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
56  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
57  bool is_case_ok = case_ok(best_choice);
58 
59  if (stopper_debug_level >= 1) {
60  const char *xht = "UNKNOWN";
61  switch (xheight_consistency) {
62  case XH_GOOD:
63  xht = "NORMAL";
64  break;
65  case XH_SUBNORMAL:
66  xht = "SUBNORMAL";
67  break;
68  case XH_INCONSISTENT:
69  xht = "INCONSISTENT";
70  break;
71  default:
72  xht = "UNKNOWN";
73  }
74  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
75  best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'),
76  (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height());
77  }
78  // Do not accept invalid words in PASS1.
79  if (reject_offset_ <= 0.0f && !is_valid_word) {
80  return false;
81  }
82  if (is_valid_word && is_case_ok) {
83  WordSize = LengthOfShortestAlphaRun(best_choice);
84  WordSize -= stopper_smallword_size;
85  if (WordSize < 0) {
86  WordSize = 0;
87  }
88  CertaintyThreshold += WordSize * stopper_certainty_per_char;
89  }
90 
91  if (stopper_debug_level >= 1) {
92  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
93  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
94  }
95 
96  if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold &&
97  xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) {
98  return true;
99  } else {
100  if (stopper_debug_level >= 1) {
101  tprintf(
102  "AcceptableChoice() returned false"
103  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
104  no_dang_ambigs, best_choice.certainty(), CertaintyThreshold,
105  UniformCertainties(best_choice));
106  }
107  return false;
108  }
109 }
@ XH_GOOD
Definition: dict.h:81
@ XH_SUBNORMAL
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:464
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:443
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:45

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 111 of file stopper.cpp.

111  {
112  if (word->best_choice == nullptr) {
113  return false;
114  }
115  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
116  int WordSize;
117 
118  if (stopper_debug_level >= 1) {
119  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
120  word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'),
121  (case_ok(*word->best_choice) ? 'y' : 'n'),
122  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
123  word->best_choices.singleton() ? 'n' : 'y');
124  }
125 
126  if (word->best_choice->empty() || !word->best_choices.singleton()) {
127  return false;
128  }
129  if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
130  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
131  WordSize -= stopper_smallword_size;
132  if (WordSize < 0) {
133  WordSize = 0;
134  }
135  CertaintyThreshold += WordSize * stopper_certainty_per_char;
136  }
137 
138  if (stopper_debug_level >= 1) {
139  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", word->best_choice->certainty(),
140  CertaintyThreshold);
141  }
142 
143  if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) {
144  if (stopper_debug_level >= 1) {
145  tprintf("ACCEPTED\n");
146  }
147  return true;
148  } else {
149  if (stopper_debug_level >= 1) {
150  tprintf("REJECTED\n");
151  }
152  return false;
153  }
154 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:801

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 647 of file dict.cpp.

647  {
648  // Do not add hyphenated word parts to the document dawg.
649  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
650  // called when the first part of the hyphenated word is
651  // discovered and while the second part of the word is recognized.
652  // hyphen_word_ is cleared in cc_recg() before the next word on
653  // the line is recognized.
654  if (hyphen_word_) {
655  return;
656  }
657 
658  int stringlen = best_choice.length();
659 
660  if (valid_word(best_choice) || stringlen < 2) {
661  return;
662  }
663 
664  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
665  if (best_choice.length() >= kDocDictMaxRepChars) {
666  int num_rep_chars = 1;
667  UNICHAR_ID uch_id = best_choice.unichar_id(0);
668  for (unsigned i = 1; i < best_choice.length(); ++i) {
669  if (best_choice.unichar_id(i) != uch_id) {
670  num_rep_chars = 1;
671  uch_id = best_choice.unichar_id(i);
672  } else {
673  ++num_rep_chars;
674  if (num_rep_chars == kDocDictMaxRepChars) {
675  return;
676  }
677  }
678  }
679  }
680 
681  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
682  if (best_choice.certainty() < doc_dict_pending_threshold) {
683  return;
684  }
685 
686  if (!pending_words_->word_in_dawg(best_choice)) {
687  if (stringlen > 2 ||
688  (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
689  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
690  pending_words_->add_word_to_dawg(best_choice);
691  }
692  return;
693  }
694  }
695 
696  if (save_doc_words) {
697  std::string filename(getCCUtil()->imagefile);
698  filename += ".doc";
699  FILE *doc_word_file = fopen(filename.c_str(), "a");
700  if (doc_word_file == nullptr) {
701  tprintf("Error: Could not open file %s\n", filename.c_str());
702  ASSERT_HOST(doc_word_file);
703  }
704  fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
705  fclose(doc_word_file);
706  }
707  document_words_->add_word_to_dawg(best_choice);
708 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
int UNICHAR_ID
Definition: unichar.h:36
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 710 of file dict.cpp.

711  {
712  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
713  word->GetTopScriptID() == getUnicharset().han_sid());
714  bool case_is_ok = (is_han || case_ok(*word));
715  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
716 
717  float adjust_factor = additional_adjust;
718  float new_rating = word->rating();
719  new_rating += kRatingPad;
720  const char *xheight_triggered = "";
721  if (word->length() > 1) {
722  // Calculate x-height and y-offset consistency penalties.
723  switch (xheight_consistency) {
724  case XH_INCONSISTENT:
725  adjust_factor += xheight_penalty_inconsistent;
726  xheight_triggered = ", xhtBAD";
727  break;
728  case XH_SUBNORMAL:
729  adjust_factor += xheight_penalty_subscripts;
730  xheight_triggered = ", xhtSUB";
731  break;
732  case XH_GOOD:
733  // leave the factor alone - all good!
734  break;
735  }
736  // TODO(eger): if nonword is true, but there is a "core" that is a dict
737  // word, negate nonword status.
738  } else {
739  if (debug) {
740  tprintf("Consistency could not be calculated.\n");
741  }
742  }
743  if (debug) {
744  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
745  word->rating(), xheight_triggered);
746  }
747 
748  if (nonword) { // non-dictionary word
749  if (case_is_ok && punc_is_ok) {
750  adjust_factor += segment_penalty_dict_nonword;
751  new_rating *= adjust_factor;
752  if (debug) {
753  tprintf(", W");
754  }
755  } else {
756  adjust_factor += segment_penalty_garbage;
757  new_rating *= adjust_factor;
758  if (debug) {
759  if (!case_is_ok) {
760  tprintf(", C");
761  }
762  if (!punc_is_ok) {
763  tprintf(", P");
764  }
765  }
766  }
767  } else { // dictionary word
768  if (case_is_ok) {
769  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
770  word->set_permuter(FREQ_DAWG_PERM);
771  adjust_factor += segment_penalty_dict_frequent_word;
772  new_rating *= adjust_factor;
773  if (debug) {
774  tprintf(", F");
775  }
776  } else {
777  adjust_factor += segment_penalty_dict_case_ok;
778  new_rating *= adjust_factor;
779  if (debug) {
780  tprintf(", ");
781  }
782  }
783  } else {
784  adjust_factor += segment_penalty_dict_case_bad;
785  new_rating *= adjust_factor;
786  if (debug) {
787  tprintf(", C");
788  }
789  }
790  }
791  new_rating -= kRatingPad;
792  if (modify_rating) {
793  word->set_rating(new_rating);
794  }
795  if (debug) {
796  tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
797  }
798  word->set_adjust_factor(adjust_factor);
799 }
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
int han_sid() const
Definition: unicharset.h:932
int null_sid() const
Definition: unicharset.h:917
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:883

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
const BLOB_CHOICE & blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 224 of file permdawg.cpp.

228  {
229  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230 
231  // Deal with fragments.
232  CHAR_FRAGMENT_INFO char_frag_info;
233  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234  prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235  return; // blob_choice must be an invalid fragment
236  }
237  // Search the next letter if this character is a fragment.
238  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239  permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240  limit, best_choice, attempts_left, more_args);
241  return;
242  }
243 
244  // Add the next unichar.
245  float old_rating = word->rating();
246  float old_certainty = word->certainty();
247  uint8_t old_permuter = word->permuter();
248  certainties[word->length()] = char_frag_info.certainty;
249  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250  char_frag_info.rating, char_frag_info.certainty);
251 
252  // Explore the next unichar.
253  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254  word, certainties, limit, best_choice, attempts_left, more_args);
255 
256  // Remove the unichar we added to explore other choices in it's place.
257  word->remove_last_unichar_id();
258  word->set_rating(old_rating);
259  word->set_certainty(old_certainty);
260  word->set_permuter(old_permuter);
261 }
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:187
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:288

◆ BOOL_VAR_H() [1/10]

tesseract::Dict::BOOL_VAR_H ( load_bigram_dawg  )

◆ BOOL_VAR_H() [2/10]

tesseract::Dict::BOOL_VAR_H ( load_freq_dawg  )

◆ BOOL_VAR_H() [3/10]

tesseract::Dict::BOOL_VAR_H ( load_number_dawg  )

◆ BOOL_VAR_H() [4/10]

tesseract::Dict::BOOL_VAR_H ( load_punc_dawg  )

◆ BOOL_VAR_H() [5/10]

tesseract::Dict::BOOL_VAR_H ( load_system_dawg  )

◆ BOOL_VAR_H() [6/10]

tesseract::Dict::BOOL_VAR_H ( load_unambig_dawg  )

◆ BOOL_VAR_H() [7/10]

tesseract::Dict::BOOL_VAR_H ( save_doc_words  )

◆ BOOL_VAR_H() [8/10]

tesseract::Dict::BOOL_VAR_H ( segment_nonalphabetic_script  )

◆ BOOL_VAR_H() [9/10]

tesseract::Dict::BOOL_VAR_H ( stopper_no_acceptable_choices  )

◆ BOOL_VAR_H() [10/10]

tesseract::Dict::BOOL_VAR_H ( use_only_first_uft8_step  )

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE &  word) const

Check a string to see if it matches a set of lexical rules.

Definition at line 45 of file context.cpp.

45  {
46  int state = 0;
47  const UNICHARSET *unicharset = word.unicharset();
48  for (unsigned x = 0; x < word.length(); ++x) {
49  UNICHAR_ID ch_id = word.unichar_id(x);
50  if (unicharset->get_isupper(ch_id)) {
51  state = case_state_table[state][1];
52  } else if (unicharset->get_islower(ch_id)) {
53  state = case_state_table[state][2];
54  } else if (unicharset->get_isdigit(ch_id)) {
55  state = case_state_table[state][3];
56  } else {
57  state = case_state_table[state][0];
58  }
59  if (state == -1) {
60  return false;
61  }
62  }
63  return state != 5; // single lower is bad
64 }
const int case_state_table[6][4]
Definition: context.cpp:28

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET &  unicharset,
UNICHAR_ID  ch,
const Dawg *  dawg 
) const
inline

Definition at line 411 of file dict.h.

411  {
412  if (!dawg) {
413  return ch;
414  }
415  switch (dawg->type()) {
416  case DAWG_TYPE_NUMBER:
417  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
418  default:
419  return ch;
420  }
421  }
@ DAWG_TYPE_NUMBER
Definition: dawg.h:67
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 116 of file dict.h.

116  {
117  const UNICHARSET &unicharset = getUnicharset();
118  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
119  const auto &normed_ids = unicharset.normed_ids(unichar_id);
120  return normed_ids.size() == 1 &&
121  (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
122  }

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE *  word) const
inline

If this word is hyphenated, copy the base word (the part on the line before) of the hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 145 of file dict.h.

145  {
146  if (this->hyphenated()) {
147  *word = *hyphen_word_;
148  if (hyphen_debug_level) {
149  word->print("copy_hyphen_info: ");
150  }
151  }
152  }
void print() const
Definition: ratngs.h:557
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR &  char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 159 of file permdawg.cpp.

160  {
161  auto *best_choice = new WERD_CHOICE(&getUnicharset());
162  best_choice->make_bad();
163  best_choice->set_rating(rating_limit);
164  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165  return best_choice;
166  }
167  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168  init_active_dawgs(&(active_dawgs[0]), true);
169  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
170  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
171 
172  float certainties[MAX_WERD_LENGTH];
174  int attempts_left = max_permuter_attempts;
175  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177  delete[] active_dawgs;
178  return best_choice;
179 }
#define MAX_WERD_LENGTH
Definition: dict.h:45
@ NO_PERM
Definition: ratngs.h:232
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:43
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:610

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET &  unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0, dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable from it – that includes all number and word dawgs. The only dawg not reachable from the punctuation dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0, the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 406 of file dict.cpp.

407  {
408  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
409 
410  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
411 
412  if (dawg_debug_level >= 3) {
413  tprintf(
414  "def_letter_is_okay: current unichar=%s word_end=%d"
415  " num active dawgs=%zu\n",
416  getUnicharset().debug_str(unichar_id).c_str(), word_end, dawg_args->active_dawgs->size());
417  }
418 
419  // Do not accept words that contain kPatternUnicharID.
420  // (otherwise pattern dawgs would not function correctly).
421  // Do not accept words containing INVALID_UNICHAR_IDs.
422  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
423  dawg_args->permuter = NO_PERM;
424  return NO_PERM;
425  }
426 
427  // Initialization.
428  PermuterType curr_perm = NO_PERM;
429  dawg_args->updated_dawgs->clear();
430  dawg_args->valid_end = false;
431 
432  // Go over the active_dawgs vector and insert DawgPosition records
433  // with the updated ref (an edge with the corresponding unichar id) into
434  // dawg_args->updated_pos.
435  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
436  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
437  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
438  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
439 
440  if (!dawg && !punc_dawg) {
441  // shouldn't happen.
442  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
443  continue;
444  }
445  if (!dawg) {
446  // We're in the punctuation dawg. A core dawg has not been chosen.
447  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
448  EDGE_REF punc_transition_edge =
449  punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
450  if (punc_transition_edge != NO_EDGE) {
451  // Find all successors, and see which can transition.
452  const SuccessorList &slist = *(successors_[pos.punc_index]);
453  for (int sdawg_index : slist) {
454  const Dawg *sdawg = dawgs_[sdawg_index];
455  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
456  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
457  if (dawg_edge != NO_EDGE) {
458  if (dawg_debug_level >= 3) {
459  tprintf("Letter found in dawg %d\n", sdawg_index);
460  }
461  dawg_args->updated_dawgs->add_unique(
462  DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
463  dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
464  if (sdawg->permuter() > curr_perm) {
465  curr_perm = sdawg->permuter();
466  }
467  if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
468  dawg_args->valid_end = true;
469  }
470  }
471  }
472  }
473  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
474  if (punc_edge != NO_EDGE) {
475  if (dawg_debug_level >= 3) {
476  tprintf("Letter found in punctuation dawg\n");
477  }
478  dawg_args->updated_dawgs->add_unique(
479  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
480  "Extend punctuation dawg: ");
481  if (PUNC_PERM > curr_perm) {
482  curr_perm = PUNC_PERM;
483  }
484  if (punc_dawg->end_of_word(punc_edge)) {
485  dawg_args->valid_end = true;
486  }
487  }
488  continue;
489  }
490 
491  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
492  // We can end the main word here.
493  // If we can continue on the punc ref, add that possibility.
494  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
495  EDGE_REF punc_edge =
496  punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
497  if (punc_edge != NO_EDGE) {
498  dawg_args->updated_dawgs->add_unique(
499  DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
500  dawg_debug_level > 0, "Return to punctuation dawg: ");
501  if (dawg->permuter() > curr_perm) {
502  curr_perm = dawg->permuter();
503  }
504  if (punc_dawg->end_of_word(punc_edge)) {
505  dawg_args->valid_end = true;
506  }
507  }
508  }
509 
510  if (pos.back_to_punc) {
511  continue;
512  }
513 
514  // If we are dealing with the pattern dawg, look up all the
515  // possible edges, not only for the exact unichar_id, but also
516  // for all its character classes (alpha, digit, etc).
517  if (dawg->type() == DAWG_TYPE_PATTERN) {
518  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
519  // There can't be any successors to dawg that is of type
520  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
521  continue;
522  }
523 
524  // Find the edge out of the node for the unichar_id.
525  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
526  EDGE_REF edge =
527  (node == NO_EDGE)
528  ? NO_EDGE
529  : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
530 
531  if (dawg_debug_level >= 3) {
532  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
533  }
534 
535  if (edge != NO_EDGE) { // the unichar was found in the current dawg
536  if (dawg_debug_level >= 3) {
537  tprintf("Letter found in dawg %d\n", pos.dawg_index);
538  }
539  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
540  if (dawg_debug_level >= 3) {
541  tprintf("Punctuation constraint not satisfied at end of word.\n");
542  }
543  continue;
544  }
545  if (dawg->permuter() > curr_perm) {
546  curr_perm = dawg->permuter();
547  }
548  if (dawg->end_of_word(edge) &&
549  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
550  dawg_args->valid_end = true;
551  }
552  dawg_args->updated_dawgs->add_unique(
553  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
554  dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
555  }
556  } // end for
557  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
558  // or if we found the current letter in a non-punctuation dawg. This
559  // allows preserving information on which dawg the "core" word came from.
560  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
561  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
562  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
563  dawg_args->permuter = curr_perm;
564  }
565  if (dawg_debug_level >= 2) {
566  tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
567  }
568  return dawg_args->permuter;
569 }
#define REFFORMAT
Definition: dawg.h:85
@ DAWG_TYPE_PATTERN
Definition: dawg.h:68
int64_t EDGE_REF
Definition: dawg.h:49
std::vector< int > SuccessorList
Definition: dawg.h:61
int64_t NODE_REF
Definition: dawg.h:50
PermuterType
Definition: ratngs.h:231
@ COMPOUND_PERM
Definition: ratngs.h:244
@ PUNC_PERM
Definition: ratngs.h:233
size_t size() const
Definition: unicharset.h:355
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:571
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 364 of file dict.h.

365  {
366  (void)lang;
367  (void)context;
368  (void)context_bytes;
369  (void)character;
370  (void)character_bytes;
371  return 0.0;
372  }
@ character
Definition: mfoutline.h:53

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector *  anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 624 of file dict.cpp.

624  {
625  bool punc_dawg_available = (punc_dawg_ != nullptr) &&
626  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
627 
628  for (unsigned i = 0; i < dawgs_.size(); i++) {
629  if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
630  int dawg_ty = dawgs_[i]->type();
631  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
632  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
633  dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
634  if (dawg_debug_level >= 3) {
635  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
636  }
637  } else if (!punc_dawg_available || !subsumed_by_punc) {
638  dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
639  if (dawg_debug_level >= 3) {
640  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
641  }
642  }
643  }
644  }
645 }
@ DAWG_TYPE_PUNCTUATION
Definition: dawg.h:65
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.

◆ double_VAR_H() [1/14]

tesseract::Dict::double_VAR_H ( certainty_scale  )

◆ double_VAR_H() [2/14]

tesseract::Dict::double_VAR_H ( doc_dict_certainty_threshold  )

◆ double_VAR_H() [3/14]

tesseract::Dict::double_VAR_H ( doc_dict_pending_threshold  )

◆ double_VAR_H() [4/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_bad  )

◆ double_VAR_H() [5/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_ok  )

◆ double_VAR_H() [6/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_frequent_word  )

◆ double_VAR_H() [7/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_nonword  )

◆ double_VAR_H() [8/14]

tesseract::Dict::double_VAR_H ( segment_penalty_garbage  )

◆ double_VAR_H() [9/14]

tesseract::Dict::double_VAR_H ( stopper_allowable_character_badness  )

◆ double_VAR_H() [10/14]

tesseract::Dict::double_VAR_H ( stopper_certainty_per_char  )

◆ double_VAR_H() [11/14]

tesseract::Dict::double_VAR_H ( stopper_nondict_certainty_base  )

◆ double_VAR_H() [12/14]

tesseract::Dict::double_VAR_H ( stopper_phase2_certainty_rejection_offset  )

◆ double_VAR_H() [13/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_inconsistent  )

◆ double_VAR_H() [14/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_subscripts  )

◆ End()

void tesseract::Dict::End ( )

Definition at line 379 of file dict.cpp.

379  {
380  if (dawgs_.empty()) {
381  return; // Not safe to call twice.
382  }
383  for (auto &dawg : dawgs_) {
384  if (!dawg_cache_->FreeDawg(dawg)) {
385  delete dawg;
386  }
387  }
388  dawg_cache_->FreeDawg(bigram_dawg_);
389  if (dawg_cache_is_ours_) {
390  delete dawg_cache_;
391  dawg_cache_ = nullptr;
392  }
393  for (auto successor : successors_) {
394  delete successor;
395  }
396  dawgs_.clear();
397  successors_.clear();
398  document_words_ = nullptr;
399  delete pending_words_;
400  pending_words_ = nullptr;
401 }
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:37

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 358 of file stopper.cpp.

358 {}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 357 of file dict.cpp.

357  {
358  if (dawgs_.empty()) {
359  return false;
360  }
361  // Construct a list of corresponding successors for each dawg. Each entry, i,
362  // in the successors_ vector is a vector of integers that represent the
363  // indices into the dawgs_ vector of the successors for dawg i.
364  successors_.reserve(dawgs_.size());
365  for (auto dawg : dawgs_) {
366  auto *lst = new SuccessorList();
367  for (unsigned j = 0; j < dawgs_.size(); ++j) {
368  const Dawg *other = dawgs_[j];
369  if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
370  kDawgSuccessors[dawg->type()][other->type()]) {
371  lst->push_back(j);
372  }
373  }
374  successors_.push_back(lst);
375  }
376  return true;
377 }

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO *  char_frag_info 
)

Definition at line 288 of file permdawg.cpp.

290  {
291  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292  const CHAR_FRAGMENT *prev_fragment =
293  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294 
295  // Print debug info for fragments.
296  if (debug && (prev_fragment || this_fragment)) {
297  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298  getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299  if (prev_fragment) {
300  tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301  }
302  if (this_fragment) {
303  tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304  }
305  }
306 
307  char_frag_info->unichar_id = curr_unichar_id;
308  char_frag_info->fragment = this_fragment;
309  char_frag_info->rating = curr_rating;
310  char_frag_info->certainty = curr_certainty;
311  char_frag_info->num_fragments = 1;
312  if (prev_fragment && !this_fragment) {
313  if (debug) {
314  tprintf("Skip choice with incomplete fragment\n");
315  }
316  return false;
317  }
318  if (this_fragment) {
319  // We are dealing with a fragment.
320  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321  if (prev_fragment) {
322  if (!this_fragment->is_continuation_of(prev_fragment)) {
323  if (debug) {
324  tprintf("Non-matching fragment piece\n");
325  }
326  return false;
327  }
328  if (this_fragment->is_ending()) {
329  char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330  char_frag_info->fragment = nullptr;
331  if (debug) {
332  tprintf("Built character %s from fragments\n",
333  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334  }
335  } else {
336  if (debug) {
337  tprintf("Record fragment continuation\n");
338  }
339  char_frag_info->fragment = this_fragment;
340  }
341  // Update certainty and rating.
342  char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344  char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345  } else {
346  if (this_fragment->is_beginning()) {
347  if (debug) {
348  tprintf("Record fragment beginning\n");
349  }
350  } else {
351  if (debug) {
352  tprintf("Non-starting fragment piece with no prev_fragment\n");
353  }
354  return false;
355  }
356  }
357  }
358  if (word_ending && char_frag_info->fragment) {
359  if (debug) {
360  tprintf("Word can not end with a fragment\n");
361  }
362  return false;
363  }
364  return true;
365 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769

◆ getCCUtil() [1/2]

CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 101 of file dict.h.

101  {
102  return ccutil_;
103  }

◆ getCCUtil() [2/2]

const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 98 of file dict.h.

98  {
99  return ccutil_;
100  }

◆ GetDawg()

const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 385 of file dict.h.

385  {
386  return dawgs_[index];
387  }

◆ GetPuncDawg()

const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 389 of file dict.h.

389  {
390  return punc_dawg_;
391  }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg *  dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 397 of file dict.h.

397  {
398  if (edge_ref == NO_EDGE) {
399  return 0; // beginning to explore the dawg
400  }
401  NODE_REF node = dawg->next_node(edge_ref);
402  if (node == 0) {
403  node = NO_EDGE; // end of word
404  }
405  return node;
406  }

◆ GetUnambigDawg()

const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 393 of file dict.h.

393  {
394  return unambig_dawg_;
395  }

◆ getUnicharAmbigs()

const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 111 of file dict.h.

111  {
112  return getCCUtil()->unichar_ambigs;
113  }
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63

◆ getUnicharset() [1/2]

UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 107 of file dict.h.

107  {
108  return getCCUtil()->unicharset;
109  }
UNICHARSET unicharset
Definition: ccutil.h:61

◆ getUnicharset() [2/2]

const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 104 of file dict.h.

104  {
105  return getCCUtil()->unicharset;
106  }

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 172 of file dict.cpp.

172  {
173  // This global cache (a singleton) will outlive every Tesseract instance
174  // (even those that someone else might declare as global statics).
175  static DawgCache cache;
176  return &cache;
177 }

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
bool  word_ending,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 43 of file permdawg.cpp.

47  {
48  auto *more_args = static_cast<DawgArgs *>(void_more_args);
49  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50  int word_index = word->length() - 1;
51  if (best_choice->rating() < *limit) {
52  return;
53  }
54  // Look up char in DAWG
55 
56  // If the current unichar is an ngram first try calling
57  // letter_is_okay() for each unigram it contains separately.
58  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59  bool checked_unigrams = false;
60  if (getUnicharset().get_isngram(orig_uch_id)) {
61  if (dawg_debug_level) {
62  tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63  }
64  int num_unigrams = 0;
65  word->remove_last_unichar_id();
66  std::vector<UNICHAR_ID> encoding;
67  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68  // Since the string came out of the unicharset, failure is impossible.
69  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70  bool unigrams_ok = true;
71  // Construct DawgArgs that reflect the current state.
72  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73  DawgPositionVector unigram_updated_dawgs;
74  DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75  // Check unigrams in the ngram with letter_is_okay().
76  for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77  UNICHAR_ID uch_id = encoding[i];
78  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79  ++num_unigrams;
80  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81  unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82  word->unichar_id(word_index + num_unigrams - 1),
83  word_ending && i == encoding.size() - 1);
84  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85  if (dawg_debug_level) {
86  tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87  unigrams_ok ? "OK" : "not OK");
88  }
89  }
90  // Restore the word and copy the updated dawg state if needed.
91  while (num_unigrams-- > 0) {
92  word->remove_last_unichar_id();
93  }
94  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95  if (unigrams_ok) {
96  checked_unigrams = true;
97  more_args->permuter = unigram_dawg_args.permuter;
98  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99  }
100  }
101 
102  // Check which dawgs from the dawgs_ vector contain the word
103  // up to and including the current unichar.
104  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105  word->unichar_id(word_index), word_ending)) {
106  // Add a new word choice
107  if (word_ending) {
108  if (dawg_debug_level) {
109  tprintf("found word = %s\n", word->debug_string().c_str());
110  }
111  if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112  if (output_ambig_words_file_ == nullptr) {
113  output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114  if (output_ambig_words_file_ == nullptr) {
115  tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116  exit(1);
117  }
118  std::string word_str;
119  word->string_and_lengths(&word_str, nullptr);
120  word_str += " ";
121  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122  }
123  std::string word_str;
124  word->string_and_lengths(&word_str, nullptr);
125  word_str += " ";
126  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127  }
128  WERD_CHOICE *adjusted_word = word;
129  adjusted_word->set_permuter(more_args->permuter);
130  update_best_choice(*adjusted_word, best_choice);
131  } else { // search the next letter
132  // Make updated_* point to the next entries in the DawgPositionVector
133  // arrays (that were originally created in dawg_permute_and_select)
134  ++(more_args->updated_dawgs);
135  // Make active_dawgs and constraints point to the updated ones.
136  ++(more_args->active_dawgs);
137  permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138  certainties, limit, best_choice, attempts_left, more_args);
139  // Restore previous state to explore another letter in this position.
140  --(more_args->updated_dawgs);
141  --(more_args->active_dawgs);
142  }
143  } else {
144  if (dawg_debug_level) {
145  tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146  }
147  }
148 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( const UNICHARSET *  unicharset,
UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 154 of file dict.h.

155  {
156  if (!last_word_on_line_ || first_pos) {
157  return false;
158  }
159  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
160  const auto &normed_ids = unicharset->normed_ids(unichar_id);
161  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
162  }

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE &  word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 164 of file dict.h.

164  {
165  int word_index = word.length() - 1;
166  return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
167  }
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:154

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 139 of file dict.h.

139  {
140  return this->hyphenated() ? hyphen_word_->length() : 0;
141  }
unsigned length() const
Definition: ratngs.h:283

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 135 of file dict.h.

135  {
136  return !last_word_on_line_ && hyphen_word_;
137  }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 610 of file dict.cpp.

610  {
611  if (hyphenated()) {
612  *active_dawgs = hyphen_active_dawgs_;
613  if (dawg_debug_level >= 3) {
614  for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
615  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
616  hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
617  }
618  }
619  } else {
620  default_dawgs(active_dawgs, ambigs_mode);
621  }
622 }
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:624

◆ INT_VAR_H() [1/6]

tesseract::Dict::INT_VAR_H ( dawg_debug_level  )

◆ INT_VAR_H() [2/6]

tesseract::Dict::INT_VAR_H ( hyphen_debug_level  )

◆ INT_VAR_H() [3/6]

tesseract::Dict::INT_VAR_H ( max_permuter_attempts  )

◆ INT_VAR_H() [4/6]

tesseract::Dict::INT_VAR_H ( stopper_debug_level  )

◆ INT_VAR_H() [5/6]

tesseract::Dict::INT_VAR_H ( stopper_smallword_size  )

◆ INT_VAR_H() [6/6]

tesseract::Dict::INT_VAR_H ( tessedit_truncate_wordchoice_log  )

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 125 of file dict.h.

125  {
126  const UNICHARSET &unicharset = getUnicharset();
127  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
128  const auto &normed_ids = unicharset.normed_ids(unichar_id);
129  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130  }

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).

Definition at line 912 of file dict.cpp.

912  {
913  const UNICHARSET &u_set = getUnicharset();
914  if (u_set.han_sid() > 0) {
915  return false;
916  }
917  if (u_set.katakana_sid() > 0) {
918  return false;
919  }
920  if (u_set.thai_sid() > 0) {
921  return false;
922  }
923  return true;
924 }

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 443 of file stopper.cpp.

443  {
444  int shortest = INT32_MAX;
445  int curr_len = 0;
446  for (unsigned w = 0; w < WordChoice.length(); ++w) {
447  if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
448  curr_len++;
449  } else if (curr_len > 0) {
450  if (curr_len < shortest) {
451  shortest = curr_len;
452  }
453  curr_len = 0;
454  }
455  }
456  if (curr_len > 0 && curr_len < shortest) {
457  shortest = curr_len;
458  } else if (shortest == INT32_MAX) {
459  shortest = 0;
460  }
461  return shortest;
462 }

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 348 of file dict.h.

349  {
350  return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
351  }

◆ Load()

void tesseract::Dict::Load ( const std::string &  lang,
TessdataManager data_file 
)

Definition at line 200 of file dict.cpp.

200  {
201  // Load dawgs_.
202  if (load_punc_dawg) {
203  punc_dawg_ =
204  dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
205  if (punc_dawg_) {
206  dawgs_.push_back(punc_dawg_);
207  }
208  }
209  if (load_system_dawg) {
210  Dawg *system_dawg =
211  dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
212  if (system_dawg) {
213  dawgs_.push_back(system_dawg);
214  }
215  }
216  if (load_number_dawg) {
217  Dawg *number_dawg =
218  dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
219  if (number_dawg) {
220  dawgs_.push_back(number_dawg);
221  }
222  }
223  if (load_bigram_dawg) {
224  bigram_dawg_ =
225  dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
226  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
227  // dawgs_!!
228  }
229  if (load_freq_dawg) {
230  freq_dawg_ =
231  dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
232  if (freq_dawg_) {
233  dawgs_.push_back(freq_dawg_);
234  }
235  }
236  if (load_unambig_dawg) {
237  unambig_dawg_ =
238  dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
239  if (unambig_dawg_) {
240  dawgs_.push_back(unambig_dawg_);
241  }
242  }
243 
244  std::string name;
245  if (!user_words_suffix.empty() || !user_words_file.empty()) {
246  Trie *trie_ptr =
247  new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
248  if (!user_words_file.empty()) {
249  name = user_words_file;
250  } else {
252  name += user_words_suffix;
253  }
254  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
256  tprintf("Error: failed to load %s\n", name.c_str());
257  delete trie_ptr;
258  } else {
259  dawgs_.push_back(trie_ptr);
260  }
261  }
262 
263  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
265  dawg_debug_level);
266  trie_ptr->initialize_patterns(&(getUnicharset()));
267  if (!user_patterns_file.empty()) {
268  name = user_patterns_file;
269  } else {
271  name += user_patterns_suffix;
272  }
273  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
274  tprintf("Error: failed to load %s\n", name.c_str());
275  delete trie_ptr;
276  } else {
277  dawgs_.push_back(trie_ptr);
278  }
279  }
280 
281  document_words_ =
282  new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
283  dawgs_.push_back(document_words_);
284 
285  // This dawg is temporary and should not be searched by letter_is_ok.
286  pending_words_ =
287  new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
288 }
@ DAWG_TYPE_WORD
Definition: dawg.h:66
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_SYSTEM_DAWG
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ USER_PATTERN_PERM
Definition: ratngs.h:239
@ DOC_DAWG_PERM
Definition: ratngs.h:241
std::string language_data_path_prefix
Definition: ccutil.h:60
Dawg * GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:43
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const std::string &  lang,
TessdataManager data_file 
)

Definition at line 291 of file dict.cpp.

291  {
292  // Load dawgs_.
293  if (load_punc_dawg) {
294  punc_dawg_ =
295  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
296  if (punc_dawg_) {
297  dawgs_.push_back(punc_dawg_);
298  }
299  }
300  if (load_system_dawg) {
301  Dawg *system_dawg =
302  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
303  if (system_dawg) {
304  dawgs_.push_back(system_dawg);
305  }
306  }
307  if (load_number_dawg) {
308  Dawg *number_dawg =
309  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
310  if (number_dawg) {
311  dawgs_.push_back(number_dawg);
312  }
313  }
314 
315  // stolen from Dict::Load (but needs params_ from Tesseract
316  // langdata/config/api):
317  std::string name;
318  if (!user_words_suffix.empty() || !user_words_file.empty()) {
319  Trie *trie_ptr =
320  new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
321  if (!user_words_file.empty()) {
322  name = user_words_file;
323  } else {
325  name += user_words_suffix;
326  }
327  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
329  tprintf("Error: failed to load %s\n", name.c_str());
330  delete trie_ptr;
331  } else {
332  dawgs_.push_back(trie_ptr);
333  }
334  }
335 
336  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
337  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
338  dawg_debug_level);
339  trie_ptr->initialize_patterns(&(getUnicharset()));
340  if (!user_patterns_file.empty()) {
341  name = user_patterns_file;
342  } else {
344  name += user_patterns_suffix;
345  }
346  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
347  tprintf("Error: failed to load %s\n", name.c_str());
348  delete trie_ptr;
349  } else {
350  dawgs_.push_back(trie_ptr);
351  }
352  }
353 }
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 158 of file stopper.cpp.

159  {
160  if (stopper_debug_level > 2) {
161  tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str());
162  }
163 
164  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
165  // for each unichar id in BestChoice.
166  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
167  bool ambigs_found = false;
168  // For each position in best_choice:
169  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
170  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
171  // -- look for ambiguities corresponding to wrong_ngram in the list while
172  // adding the following unichar_ids from best_choice to wrong_ngram
173  //
174  // Repeat the above procedure twice: first time look through
175  // ambigs to be replaced and replace all the ambiguities found;
176  // second time look through dangerous ambiguities and construct
177  // ambig_blob_choices with fake a blob choice for each ambiguity
178  // and pass them to dawg_permute_and_select() to search for
179  // ambiguous words in the dictionaries.
180  //
181  // Note that during the execution of the for loop (on the first pass)
182  // if replacements are made the length of best_choice might change.
183  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
184  bool replace = (fix_replaceable && pass == 0);
185  const UnicharAmbigsVector &table =
187  if (!replace) {
188  // Initialize ambig_blob_choices with lists containing a single
189  // unichar id for the corresponding position in best_choice.
190  // best_choice consisting from only the original letters will
191  // have a rating of 0.0.
192  for (unsigned i = 0; i < best_choice->length(); ++i) {
193  auto *lst = new BLOB_CHOICE_LIST();
194  BLOB_CHOICE_IT lst_it(lst);
195  // TODO(rays/antonova) Put real xheights and y shifts here.
196  lst_it.add_to_end(
197  new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
198  ambig_blob_choices.push_back(lst);
199  }
200  }
201  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
202  int wrong_ngram_index;
203  int blob_index = 0;
204  for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
205  auto curr_unichar_id = best_choice->unichar_id(i);
206  if (stopper_debug_level > 2) {
207  tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
208  getUnicharset().debug_str(curr_unichar_id).c_str());
209  }
210  int num_wrong_blobs = best_choice->state(i);
211  wrong_ngram_index = 0;
212  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213  if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||
214  table[curr_unichar_id] == nullptr) {
215  continue; // there is no ambig spec for this unichar id
216  }
217  AmbigSpec_IT spec_it(table[curr_unichar_id]);
218  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
219  const AmbigSpec *ambig_spec = spec_it.data();
220  wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
221  int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);
222  if (stopper_debug_level > 2) {
223  tprintf("candidate ngram: ");
225  tprintf("current ngram from spec: ");
226  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
227  tprintf("comparison result: %d\n", compare);
228  }
229  if (compare == 0) {
230  // Record the place where we found an ambiguity.
231  if (fixpt != nullptr) {
232  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
233  fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,
234  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
235  leftmost_id));
236  if (stopper_debug_level > 1) {
237  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false,
238  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
239  getUnicharset().id_to_unichar(leftmost_id));
240  }
241  }
242 
243  if (replace) {
244  if (stopper_debug_level > 2) {
245  tprintf("replace ambiguity with %s : ",
246  getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));
247  UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
248  }
249  ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,
250  ratings);
251  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
252  // We found dang ambig - update ambig_blob_choices.
253  if (stopper_debug_level > 2) {
254  tprintf("found ambiguity: ");
255  UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
256  }
257  ambigs_found = true;
258  for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
259  // Add a blob choice for the corresponding fragment of the
260  // ambiguity. These fake blob choices are initialized with
261  // negative ratings (which are not possible for real blob
262  // choices), so that dawg_permute_and_select() considers any
263  // word not consisting of only the original letters a better
264  // choice and stops searching for alternatives once such a
265  // choice is found.
266  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);
267  bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
268  -1, 0, 1, 0, BCC_AMBIG));
269  }
270  }
271  spec_it.forward();
272  } else if (compare == -1) {
273  unsigned next_index;
274  if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
275  ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
276  // Add the next unichar id to wrong_ngram and keep looking for
277  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
278  wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index);
279  num_wrong_blobs += best_choice->state(next_index);
280  } else {
281  break; // no more matching ambigs in this AMBIG_SPEC_LIST
282  }
283  } else {
284  spec_it.forward();
285  }
286  } // end searching AmbigSpec_LIST
287  } // end searching best_choice
288  } // end searching replace and dangerous ambigs
289 
290  // If any ambiguities were found permute the constructed ambig_blob_choices
291  // to see if an alternative dictionary word can be found.
292  if (ambigs_found) {
293  if (stopper_debug_level > 2) {
294  tprintf("\nResulting ambig_blob_choices:\n");
295  for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {
296  print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
297  tprintf("\n");
298  }
299  }
300  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
301  ambigs_found = (alt_word->rating() < 0.0);
302  if (ambigs_found) {
303  if (stopper_debug_level >= 1) {
304  tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str());
305  }
306  if (fixpt != nullptr) {
307  // Note: Currently character choices combined from fragments can only
308  // be generated by NoDangrousAmbigs(). This code should be updated if
309  // the capability to produce classifications combined from character
310  // fragments is added to other functions.
311  int orig_i = 0;
312  for (unsigned i = 0; i < alt_word->length(); ++i) {
313  const UNICHARSET &uchset = getUnicharset();
314  bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
315  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
316  if (replacement_is_ngram) {
317  // we have to extract the leftmost unichar from the ngram.
318  const char *str = uchset.id_to_unichar(leftmost_id);
319  int step = uchset.step(str);
320  if (step) {
321  leftmost_id = uchset.unichar_to_id(str, step);
322  }
323  }
324  int end_i = orig_i + alt_word->state(i);
325  if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {
326  // Compute proper blob indices.
327  int blob_start = 0;
328  for (int j = 0; j < orig_i; ++j) {
329  blob_start += best_choice->state(j);
330  }
331  int blob_end = blob_start;
332  for (int j = orig_i; j < end_i; ++j) {
333  blob_end += best_choice->state(j);
334  }
335  fixpt->push_back(
336  DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id));
337  if (stopper_debug_level > 1) {
338  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true,
339  replacement_is_ngram, uchset.id_to_unichar(leftmost_id));
340  }
341  }
342  orig_i += alt_word->state(i);
343  }
344  }
345  }
346  delete alt_word;
347  }
348  if (output_ambig_words_file_ != nullptr) {
349  fprintf(output_ambig_words_file_, "\n");
350  }
351 
352  for (auto data : ambig_blob_choices) {
353  delete data;
354  }
355  return !ambigs_found;
356 }
#define MAX_AMBIG_SIZE
Definition: ambigs.h:34
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
@ CASE_AMBIG
Definition: ambigs.h:45
@ BCC_AMBIG
Definition: ratngs.h:52
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:623
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:93
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:58
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:160
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:370
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:159
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 381 of file dict.h.

381  {
382  return dawgs_.size();
383  }

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 187 of file permdawg.cpp.

190  {
191  if (debug) {
192  tprintf(
193  "%s permute_choices: char_choice_index=%d"
194  " limit=%g rating=%g, certainty=%g word=%s\n",
195  debug, char_choice_index, *limit, word->rating(), word->certainty(),
196  word->debug_string().c_str());
197  }
198  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199  BLOB_CHOICE_IT blob_choice_it;
200  blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202  (*attempts_left)--;
203  append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204  prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205  more_args);
206  if (*attempts_left <= 0) {
207  if (debug) {
208  tprintf("permute_choices(): attempts_left is 0\n");
209  }
210  break;
211  }
212  }
213  }
214 }
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:224

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 357 of file dict.h.

358  {
359  return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
360  character, character_bytes);
361  }
std::string lang
Definition: ccutil.h:59

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgPosition info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing edge or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 571 of file dict.cpp.

572  {
573  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
574  // Try to find the edge corresponding to the exact unichar_id and to all the
575  // edges corresponding to the character class of unichar_id.
576  std::vector<UNICHAR_ID> unichar_id_patterns;
577  unichar_id_patterns.push_back(unichar_id);
578  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
579  for (int unichar_id_pattern : unichar_id_patterns) {
580  // On the first iteration check all the outgoing edges.
581  // On the second iteration check all self-loops.
582  for (int k = 0; k < 2; ++k) {
583  EDGE_REF edge = (k == 0)
584  ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
585  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
586  if (edge == NO_EDGE) {
587  continue;
588  }
589  if (dawg_debug_level >= 3) {
590  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
591  edge);
592  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
593  }
594  if (dawg->permuter() > *curr_perm) {
595  *curr_perm = dawg->permuter();
596  }
597  if (dawg->end_of_word(edge)) {
598  dawg_args->valid_end = true;
599  }
600  dawg_args->updated_dawgs->add_unique(
601  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
602  dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
603  }
604  }
605 }

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 370 of file stopper.cpp.

371  {
372  int num_blobs_to_replace = 0;
373  int begin_blob_index = 0;
374  int i;
375  // Rating and certainty for the new BLOB_CHOICE are derived from the
376  // replaced choices.
377  float new_rating = 0.0f;
378  float new_certainty = 0.0f;
379  BLOB_CHOICE *old_choice = nullptr;
380  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
381  if (i >= wrong_ngram_begin_index) {
382  int num_blobs = werd_choice->state(i);
383  int col = begin_blob_index + num_blobs_to_replace;
384  int row = col + num_blobs - 1;
385  BLOB_CHOICE_LIST *choices = ratings->get(col, row);
386  ASSERT_HOST(choices != nullptr);
387  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
388  ASSERT_HOST(old_choice != nullptr);
389  new_rating += old_choice->rating();
390  new_certainty += old_choice->certainty();
391  num_blobs_to_replace += num_blobs;
392  } else {
393  begin_blob_index += werd_choice->state(i);
394  }
395  }
396  new_certainty /= wrong_ngram_size;
397  // If there is no entry in the ratings matrix, add it.
398  MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);
399  if (!coord.Valid(*ratings)) {
400  ratings->IncreaseBandSize(coord.row - coord.col + 1);
401  }
402  if (ratings->get(coord.col, coord.row) == nullptr) {
403  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
404  }
405  BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row);
406  BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices);
407  if (choice != nullptr) {
408  // Already there. Upgrade if new rating better.
409  if (new_rating < choice->rating()) {
410  choice->set_rating(new_rating);
411  }
412  if (new_certainty < choice->certainty()) {
413  choice->set_certainty(new_certainty);
414  }
415  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
416  } else {
417  // Need a new choice with the correct_ngram_id.
418  choice = new BLOB_CHOICE(*old_choice);
419  choice->set_unichar_id(correct_ngram_id);
420  choice->set_rating(new_rating);
421  choice->set_certainty(new_certainty);
422  choice->set_classifier(BCC_AMBIG);
423  choice->set_matrix_cell(coord.col, coord.row);
424  BLOB_CHOICE_IT it(new_choices);
425  it.add_to_end(choice);
426  }
427  // Remove current unichar from werd_choice. On the last iteration
428  // set the correct replacement unichar instead of removing a unichar.
429  for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {
430  if (replaced_count + 1 == wrong_ngram_size) {
431  werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);
432  } else {
433  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
434  }
435  }
436  if (stopper_debug_level >= 1) {
437  werd_choice->print("ReplaceAmbig() ");
438  tprintf("Modified blob_choices: ");
439  print_ratings_list("\n", new_choices, getUnicharset());
440  }
441 }
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (thus it is the first one on the line), erase hyphen_word_ and clear hyphen_active_dawgs_. In all cases, update last_word_on_line_ to the given value.

Definition at line 27 of file hyphen.cpp.

27  {
28  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
29  if (hyphen_word_ != nullptr) {
30  delete hyphen_word_;
31  hyphen_word_ = nullptr;
32  hyphen_active_dawgs_.clear();
33  }
34  }
35  if (hyphen_debug_level) {
36  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n", last_word_on_line_,
37  last_word_on_line);
38  }
39  last_word_on_line_ = last_word_on_line;
40 }

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 297 of file dict.h.

297  {
298  if (pending_words_ != nullptr) {
299  pending_words_->clear();
300  }
301  if (document_words_ != nullptr) {
302  document_words_->clear();
303  }
304  }
void clear()
Definition: trie.cpp:50

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

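Update hyphen_word_ (with its trailing hyphen removed) and copy the given DawgPositionVector into hyphen_active_dawgs_, but only if the given word has a lower rating than the currently stored hyphen word.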

Definition at line 44 of file hyphen.cpp.

44  {
45  if (hyphen_word_ == nullptr) {
46  hyphen_word_ = new WERD_CHOICE(word.unicharset());
47  hyphen_word_->make_bad();
48  }
49  if (hyphen_word_->rating() > word.rating()) {
50  *hyphen_word_ = word;
51  // Remove the last unichar id as it is a hyphen, and remove
52  // any unichar_string/lengths that are present.
53  hyphen_word_->remove_last_unichar_id();
54  hyphen_active_dawgs_ = active_dawgs;
55  }
56  if (hyphen_debug_level) {
57  hyphen_word_->print("set_hyphen_word: ");
58  }
59 }
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:415
void remove_last_unichar_id()
Definition: ratngs.h:451
float rating() const
Definition: ratngs.h:308

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 362 of file stopper.cpp.

362  {
363  reject_offset_ = 0.0;
364 }

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 366 of file stopper.cpp.

366  {
367  reject_offset_ = stopper_phase2_certainty_rejection_offset;
368 }

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache dawg_cache)

Definition at line 180 of file dict.cpp.

180  {
181  if (dawgs_.size() != 0) {
182  this->End();
183  }
184 
185  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
186  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
187  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
188  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
189 
190  if (dawg_cache != nullptr) {
191  dawg_cache_ = dawg_cache;
192  dawg_cache_is_ours_ = false;
193  } else {
194  dawg_cache_ = new DawgCache();
195  dawg_cache_is_ours_ = true;
196  }
197 }

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 374 of file dict.h.

374  {
375  wildcard_unichar_id_ = id;
376  }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 469 of file dict.h.

469  {
470  wordseg_rating_adjust_factor_ = f;
471  }

◆ STRING_VAR_H() [1/6]

tesseract::Dict::STRING_VAR_H ( output_ambig_words_file  )

◆ STRING_VAR_H() [2/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_file  )

◆ STRING_VAR_H() [3/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_suffix  )

◆ STRING_VAR_H() [4/6]

tesseract::Dict::STRING_VAR_H ( user_words_file  )

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class.

◆ STRING_VAR_H() [5/6]

tesseract::Dict::STRING_VAR_H ( user_words_suffix  )

◆ STRING_VAR_H() [6/6]

tesseract::Dict::STRING_VAR_H ( word_to_debug  )

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE & word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 464 of file stopper.cpp.

464  {
465  float Certainty;
466  float WorstCertainty = FLT_MAX;
467  float CertaintyThreshold;
468  double TotalCertainty;
469  double TotalCertaintySquared;
470  double Variance;
471  float Mean, StdDev;
472  int word_length = word.length();
473 
474  if (word_length < 3) {
475  return true;
476  }
477 
478  TotalCertainty = TotalCertaintySquared = 0.0;
479  for (int i = 0; i < word_length; ++i) {
480  Certainty = word.certainty(i);
481  TotalCertainty += Certainty;
482  TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483  if (Certainty < WorstCertainty) {
484  WorstCertainty = Certainty;
485  }
486  }
487 
488  // Subtract off worst certainty from statistics.
489  word_length--;
490  TotalCertainty -= WorstCertainty;
491  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
492 
493  Mean = TotalCertainty / word_length;
494  Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) /
495  (word_length * (word_length - 1)));
496  if (Variance < 0.0) {
497  Variance = 0.0;
498  }
499  StdDev = sqrt(Variance);
500 
501  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
502  if (CertaintyThreshold > stopper_nondict_certainty_base) {
503  CertaintyThreshold = stopper_nondict_certainty_base;
504  }
505 
506  if (word.certainty() < CertaintyThreshold) {
507  if (stopper_debug_level >= 1) {
508  tprintf(
509  "Stopper: Non-uniform certainty = %4.1f"
510  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
511  word.certainty(), Mean, StdDev, CertaintyThreshold);
512  }
513  return false;
514  } else {
515  return true;
516  }
517 }
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1663

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE & word,
WERD_CHOICE * best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 182 of file dict.h.

182  {
183  if (word.rating() < best_choice->rating()) {
184  *best_choice = word;
185  }
186  }

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 836 of file dict.cpp.

836  {
837  if (bigram_dawg_ == nullptr) {
838  return false;
839  }
840 
841  // Extract the core word from the middle of each word with any digits
842  // replaced with question marks.
843  unsigned w1start, w1end, w2start, w2end;
844  word1.punct_stripped(&w1start, &w1end);
845  word2.punct_stripped(&w2start, &w2end);
846 
847  // We don't want to penalize a single guillemet, hyphen, etc.
848  // But our bigram list doesn't have any information about punctuation.
849  if (w1start >= w1end) {
850  return word1.length() < 3;
851  }
852  if (w2start >= w2end) {
853  return word2.length() < 3;
854  }
855 
856  const UNICHARSET &uchset = getUnicharset();
857  std::vector<UNICHAR_ID> bigram_string;
858  bigram_string.reserve(w1end + w2end + 1);
859  for (auto i = w1start; i < w1end; i++) {
860  const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
861  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
862  bigram_string.push_back(question_unichar_id_);
863  } else {
864  bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
865  }
866  }
867  bigram_string.push_back(UNICHAR_SPACE);
868  for (auto i = w2start; i < w2end; i++) {
869  const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
870  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
871  bigram_string.push_back(question_unichar_id_);
872  } else {
873  bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
874  }
875  }
876  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
877  for (int i : bigram_string) {
878  normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);
879  }
880  return bigram_dawg_->word_in_dawg(normalized_word);
881 }
@ UNICHAR_SPACE
Definition: unicharset.h:36
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:869

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 883 of file dict.cpp.

883  {
884  if (word.empty()) {
885  return NO_PERM;
886  }
887  WERD_CHOICE new_word(word.unicharset());
888  auto last_index = word.length() - 1;
889  int new_len = 0;
890  for (unsigned i = 0; i <= last_index; ++i) {
891  UNICHAR_ID unichar_id = (word.unichar_id(i));
892  if (getUnicharset().get_ispunctuation(unichar_id)) {
893  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
894  } else if (!getUnicharset().get_isalpha(unichar_id) &&
895  !getUnicharset().get_isdigit(unichar_id)) {
896  return false; // neither punc, nor alpha, nor digit
897  } else if ((new_len = new_word.length()) == 0 ||
898  new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
899  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
900  }
901  }
902  for (unsigned i = 0; i < dawgs_.size(); ++i) {
903  if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
904  dawgs_[i]->word_in_dawg(new_word)) {
905  return true;
906  }
907  }
908  return false;
909 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 450 of file dict.h.

450  {
451  WERD_CHOICE word(string, getUnicharset());
452  return valid_word(word);
453  }

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word) const
inline

Definition at line 443 of file dict.h.

443  {
444  return valid_word(word, false); // return NO_PERM for words with digits
445  }

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word,
bool  numbers_ok 
) const

Definition at line 801 of file dict.cpp.

801  {
802  const WERD_CHOICE *word_ptr = &word;
803  WERD_CHOICE temp_word(word.unicharset());
804  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
805  copy_hyphen_info(&temp_word);
806  temp_word += word;
807  word_ptr = &temp_word;
808  }
809  if (word_ptr->empty()) {
810  return NO_PERM;
811  }
812  // Allocate vectors for holding current and updated
813  // active_dawgs and initialize them.
814  DawgPositionVector active_dawgs[2];
815  init_active_dawgs(&(active_dawgs[0]), false);
816  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
817  int last_index = word_ptr->length() - 1;
818  // Call letter_is_okay for each letter in the word.
819  for (int i = hyphen_base_size(); i <= last_index; ++i) {
820  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
821  i == last_index))) {
822  break;
823  }
824  // Swap active_dawgs, constraints with the corresponding updated vector.
825  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
826  dawg_args.updated_dawgs = &(active_dawgs[0]);
827  ++(dawg_args.active_dawgs);
828  } else {
829  ++(dawg_args.updated_dawgs);
830  dawg_args.active_dawgs = &(active_dawgs[0]);
831  }
832  }
833  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
834 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE word) const
inline

Definition at line 446 of file dict.h.

446  {
447  return valid_word(word, true); // return NUMBER_PERM for valid numbers
448  }

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inline static

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 437 of file dict.h.

437  {
438  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
439  perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
440  (numbers_ok && perm == NUMBER_PERM));
441  }
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ NUMBER_PERM
Definition: ratngs.h:238

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 377 of file dict.h.

377  {
378  return wildcard_unichar_id_;
379  }

Member Data Documentation

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 210 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 345 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 354 of file dict.h.


The documentation for this class was generated from the following files: