tesseract  5.0.0
reject.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: reject.cpp (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "reject.h"
25 
26 #ifdef DISABLED_LEGACY_ENGINE
27 
28 # include "tesseractclass.h"
29 
30 namespace tesseract {
31 
32 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
33  const WERD_CHOICE &word = *werd_res->best_choice;
34  int dict_word_type = werd_res->tesseract->dict_word(word);
35  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
36 }
37 } // namespace tesseract
38 
39 #else
40 
41 # include "control.h"
42 # include "docqual.h"
43 # include "tesseractclass.h"
44 # include "tessvars.h"
45 
46 # include "helpers.h"
47 
48 # include <algorithm> // for std::sort
49 # include <cctype>
50 # include <cerrno>
51 # include <cstring>
52 # include <vector> // for std::vector
53 
54 namespace tesseract {
55 
56 /*************************************************************************
57  * set_done()
58  *
59  * Set the done flag based on the word acceptability criteria
60  *************************************************************************/
61 
62 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
63  word->done =
64  word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67  word->best_choice->permuter() == FREQ_DAWG_PERM ||
69  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70  one_ell_conflict(word, false)) {
71  if (tessedit_rejection_debug) {
72  tprintf("one_ell_conflict detected\n");
73  }
74  word->done = false;
75  }
76  if (word->done &&
77  ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78  if (tessedit_rejection_debug) {
79  tprintf("non-dict or ambig word detected\n");
80  }
81  word->done = false;
82  }
83  if (tessedit_rejection_debug) {
84  tprintf("set_done(): done=%d\n", word->done);
85  word->best_choice->print("");
86  }
87 }
88 
89 /*************************************************************************
90  * make_reject_map()
91  *
92  * Sets the done flag to indicate whether the result is acceptable.
93  *
94  * Sets a reject map for the word.
95  *************************************************************************/
96 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
97  flip_0O(word);
98  check_debug_pt(word, -1); // For trap only
99  set_done(word, pass); // Set acceptance
100  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101  reject_blanks(word);
102  /*
103 0: Rays original heuristic - the baseline
104 */
105  if (tessedit_reject_mode == 0) {
106  if (!word->done) {
107  reject_poor_matches(word);
108  }
109  } else if (tessedit_reject_mode == 5) {
110  /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113  and the whole of any words which are very small
114 */
115  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
117  } else {
118  one_ell_conflict(word, true);
119  /*
120  Originally the code here just used the done flag. Now I have duplicated
121  and unpacked the conditions for setting the done flag so that each
122  mechanism can be turned on or off independently. This works WITHOUT
123  affecting the done flag setting.
124 */
125  if (rej_use_tess_accepted && !word->tess_accepted) {
127  }
128 
129  if (rej_use_tess_blanks &&
130  (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
132  }
133 
134  WERD_CHOICE *best_choice = word->best_choice;
135  if (rej_use_good_perm) {
136  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137  best_choice->permuter() == FREQ_DAWG_PERM ||
138  best_choice->permuter() == USER_DAWG_PERM) &&
139  (!rej_use_sensible_wd ||
140  acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141  best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142  // PASSED TEST
143  } else if (best_choice->permuter() == NUMBER_PERM) {
144  if (rej_alphas_in_number_perm) {
145  for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146  offset += best_choice->unichar_lengths()[i++]) {
147  if (word->reject_map[i].accepted() &&
148  word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149  best_choice->unichar_lengths()[i])) {
150  word->reject_map[i].setrej_bad_permuter();
151  }
152  // rej alpha
153  }
154  }
155  } else {
157  }
158  }
159  /* Ambig word rejection was here once !!*/
160  }
161  } else {
162  tprintf("BAD tessedit_reject_mode\n");
163  ASSERT_HOST("Fatal error encountered!" == nullptr);
164  }
165 
166  if (tessedit_image_border > -1) {
167  reject_edge_blobs(word);
168  }
169 
170  check_debug_pt(word, 10);
171  if (tessedit_rejection_debug) {
172  tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173  tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
174  word->best_choice->rating());
175  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176  }
177 
178  flip_hyphens(word);
179  check_debug_pt(word, 20);
180 }
181 
182 void reject_blanks(WERD_RES *word) {
183  int16_t i;
184  int16_t offset;
185 
186  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
187  offset += word->best_choice->unichar_lengths()[i], i += 1) {
188  if (word->best_choice->unichar_string()[offset] == ' ') {
189  // rej unrecognised blobs
190  word->reject_map[i].setrej_tess_failure();
191  }
192  }
193 }
194 
196  int16_t i;
197  int16_t offset;
198 
199  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202  // rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict();
204  }
205  }
206 }
207 
209  float threshold = compute_reject_threshold(word->best_choice);
210  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
211  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
212  word->reject_map[i].setrej_tess_failure();
213  } else if (word->best_choice->certainty(i) < threshold) {
214  word->reject_map[i].setrej_poor_match();
215  }
216  }
217 }
218 
219 /**********************************************************************
220  * compute_reject_threshold
221  *
222  * Set a rejection threshold for this word.
223  * Initially this is a trivial function which looks for the largest
224  * gap in the certainty value.
225  **********************************************************************/
226 
228  float threshold; // rejection threshold
229  float bestgap = 0.0f; // biggest gap
230  float gapstart; // bottom of gap
231 
232  auto blob_count = word->length();
233  std::vector<float> ratings;
234  ratings.reserve(blob_count);
235  for (unsigned i = 0; i < blob_count; ++i) {
236  ratings.push_back(word->certainty(i));
237  }
238  std::sort(ratings.begin(), ratings.end());
239  gapstart = ratings[0] - 1; // all reject if none better
240  if (blob_count >= 3) {
241  for (unsigned index = 0; index < blob_count - 1; index++) {
242  if (ratings[index + 1] - ratings[index] > bestgap) {
243  bestgap = ratings[index + 1] - ratings[index];
244  // find biggest
245  gapstart = ratings[index];
246  }
247  }
248  }
249  threshold = gapstart + bestgap / 2;
250 
251  return threshold;
252 }
253 
254 /*************************************************************************
255  * reject_edge_blobs()
256  *
257  * If the word is perilously close to the edge of the image, reject those blobs
258  * in the word which are too close to the edge as they could be clipped.
259  *************************************************************************/
261  TBOX word_box = word->word->bounding_box();
262  // Use the box_word as it is already denormed back to image coordinates.
263  int blobcount = word->box_word->length();
264 
265  if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268  ASSERT_HOST(word->reject_map.length() == blobcount);
269  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270  TBOX blob_box = word->box_word->BlobBox(blobindex);
271  if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274  word->reject_map[blobindex].setrej_edge_char();
275  // Close to edge
276  }
277  }
278  }
279 }
280 
281 /**********************************************************************
282  * one_ell_conflict()
283  *
284  * Identify words where there is a potential I/l/1 error.
285  * - A bundle of contextual heuristics!
286  **********************************************************************/
// Decides whether the best choice of word_res has a potential I/l/1 confusion.
// Returns true when a conflict is found, false otherwise. When update_map is
// true, the affected reject_map entries are marked: either a single entry via
// setrej_1Il_conflict(), or every conflict-set character via reject_I_1_L().
//
// The checks run in this order, and the first one that decides returns:
//  1. no conflict_set_I_l_1 character in the string -> false;
//  2. no alphanumeric outside the conflict set -> true;
//  3. dictionary-backed words: flip a leading single-byte 'I'/'l', test the
//     alternative with safe_dict_word(), then restore the original character;
//  4. any word: same flip; returns false if the flipped form is a dictionary
//     word, otherwise restores the character and continues;
//  5. words containing a digit other than '1';
//  6. classification by acceptable_word_string().
287 bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
288  const char *word;
289  const char *lengths;
290  int16_t word_len; // its length
291  int16_t first_alphanum_index_;
292  int16_t first_alphanum_offset_;
293  int16_t i;
294  int16_t offset;
295  bool non_conflict_set_char; // non conf set a/n?
296  bool conflict = false;
297  bool allow_1s;
298  ACCEPTABLE_WERD_TYPE word_type;
299  bool dict_perm_type;
300  bool dict_word_ok;
301  int dict_word_type;
302 
303  word = word_res->best_choice->unichar_string().c_str();
304  lengths = word_res->best_choice->unichar_lengths().c_str();
305  word_len = strlen(lengths);
306  /*
307  If there are no occurrences of the conflict set characters then the word
308  is OK.
309 */
310  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
311  return false;
312  }
313 
314  /*
315  There is a conflict if there are NO other (confirmed) alphanumerics apart
316  from those in the conflict set.
317 */
318 
// Stops at the first alphanumeric whose first byte is not in the conflict set.
319  for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
320  offset += lengths[i++]) {
321  non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
322  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
323  !conflict_set_I_l_1.contains(word[offset]);
324  }
325  if (!non_conflict_set_char) {
326  if (update_map) {
327  reject_I_1_L(word_res);
328  }
329  return true;
330  }
331 
332  /*
333  If the word is accepted by a dawg permuter, and the first alpha character
334  is "I" or "l", check to see if the alternative is also a dawg word. If it
335  is, then there is a potential error otherwise the word is ok.
336 */
337 
338  dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
339  (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
340  (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
341  (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
342  dict_word_type = dict_word(*(word_res->best_choice));
343  dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
344 
345  if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346  (dict_perm_type && dict_word_ok)) {
347  first_alphanum_index_ = first_alphanum_index(word, lengths);
348  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
// Leading 'I': temporarily write 'l' into the string, test, then put 'I' back
// on both outcomes.
349  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
350  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
351  if (safe_dict_word(word_res) > 0) {
352  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
353  if (update_map) {
354  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
355  }
356  return true;
357  } else {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
359  return false;
360  }
361  }
362 
// Leading 'l': the mirror image of the case above.
363  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
364  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365  if (safe_dict_word(word_res) > 0) {
366  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
367  if (update_map) {
368  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
369  }
370  return true;
371  } else {
372  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
373  return false;
374  }
375  }
376  return false;
377  }
378 
379  /*
380  NEW 1Il code. The old code relied on permuter types too much. In fact,
381  tess will use TOP_CHOICE permute for good things like "palette".
382  In this code the string is examined independently to see if it looks like
383  a well formed word.
384 */
385 
386  /*
387  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
388  dictionary word.
389 */
390  first_alphanum_index_ = first_alphanum_index(word, lengths);
391  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
// Unlike the dictionary branch above, a successful flip here returns false
// WITHOUT restoring the original character: the string is left flipped.
392  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
393  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
394  if (safe_dict_word(word_res) > 0) {
395  return false;
396  } else {
397  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
398  }
399  } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
400  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
401  if (safe_dict_word(word_res) > 0) {
402  return false;
403  } else {
404  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405  }
406  }
407  /*
408  For strings containing digits:
409  If there are no alphas OR the numeric permuter liked the word,
410  reject any non 1 conflict chs
411  Else reject all conflict chs
412 */
413  if (word_contains_non_1_digit(word, lengths)) {
414  allow_1s =
415  (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
416 
// NOTE(review): this declaration shadows the function-scope `offset` above.
417  int16_t offset;
418  conflict = false;
419  for (i = 0, offset = 0; word[offset] != '\0';
420  offset += word_res->best_choice->unichar_lengths()[i++]) {
421  if ((!allow_1s || (word[offset] != '1')) &&
422  conflict_set_I_l_1.contains(word[offset])) {
423  if (update_map) {
424  word_res->reject_map[i].setrej_1Il_conflict();
425  }
426  conflict = true;
427  }
428  }
429  return conflict;
430  }
431  /*
432  For anything else. See if it conforms to an acceptable word type. If so,
433  treat accordingly.
434 */
435  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
436  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
437  first_alphanum_index_ = first_alphanum_index(word, lengths);
438  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
// Only the first alphanumeric is examined for lower-case/initial-cap words.
439  if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
440  if (update_map) {
441  word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
442  }
443  return true;
444  } else {
445  return false;
446  }
447  } else if (word_type == AC_UPPER_CASE) {
448  return false;
449  } else {
// Any other word type: every conflict-set character is rejected.
450  if (update_map) {
451  reject_I_1_L(word_res);
452  }
453  return true;
454  }
455 }
456 
457 int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
458  int16_t i;
459  int16_t offset;
460 
461  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
462  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
463  unicharset.get_isdigit(word + offset, word_lengths[i])) {
464  return i;
465  }
466  }
467  return -1;
468 }
469 
470 int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
471  int16_t i;
472  int16_t offset;
473 
474  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476  unicharset.get_isdigit(word + offset, word_lengths[i])) {
477  return offset;
478  }
479  }
480  return -1;
481 }
482 
483 int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
484  int16_t i;
485  int16_t offset;
486  int16_t count = 0;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
490  count++;
491  }
492  }
493  return count;
494 }
495 
496 bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
497  int16_t i;
498  int16_t offset;
499 
500  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
501  if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
502  (word_lengths[i] != 1 || word[offset] != '1')) {
503  return true;
504  }
505  }
506  return false;
507 }
508 
509 /*************************************************************************
510  * dont_allow_1Il()
511  * Don't unreject LONE accepted 1Il conflict set chars
512  *************************************************************************/
514  int word_len = word->reject_map.length();
515  const char *s = word->best_choice->unichar_string().c_str();
516  const char *lengths = word->best_choice->unichar_lengths().c_str();
517  bool accepted_1Il = false;
518 
519  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
520  if (word->reject_map[i].accepted()) {
521  if (conflict_set_I_l_1.contains(s[offset])) {
522  accepted_1Il = true;
523  } else {
524  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
525  word->uch_set->get_isdigit(s + offset, lengths[i])) {
526  return; // >=1 non 1Il ch accepted
527  }
528  }
529  }
530  }
531  if (!accepted_1Il) {
532  return; // Nothing to worry about
533  }
534 
535  for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
536  if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
537  word->reject_map[i].setrej_postNN_1Il();
538  }
539  }
540 }
541 
543  int count = 0;
544  const WERD_CHOICE *best_choice = word_res->best_choice;
545  for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
546  if ((word_res->reject_map[i].accepted()) &&
547  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
548  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
549  count++;
550  }
551  }
552  return count;
553 }
554 
555 // reject all if most rejected.
557  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
558 
559  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
560  rej_whole_of_mostly_reject_word_fract) {
562  }
563 }
564 
566  if (word->best_choice->unichar_lengths().length() <= 1) {
567  return false;
568  }
569 
570  if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
571  return false;
572  }
573 
574  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
575  for (unsigned i = 1; i < word->best_choice->length(); ++i) {
576  if (word->best_choice->unichar_id(i) != uch_id) {
577  return false;
578  }
579  }
580 
581  int16_t char_quality;
582  int16_t accepted_char_quality;
583  word_char_quality(word, &char_quality, &accepted_char_quality);
584 
585  if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
586  (char_quality == accepted_char_quality)) {
587  return true;
588  } else {
589  return false;
590  }
591 }
592 
593 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
594  const WERD_CHOICE &word = *werd_res->best_choice;
595  int dict_word_type = werd_res->tesseract->dict_word(word);
596  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
597 }
598 
599 // Note: After running this function word_res->ratings
600 // might not contain the right BLOB_CHOICE corresponding to each character
601 // in word_res->best_choice.
603  WERD_CHOICE *best_choice = word_res->best_choice;
604  int prev_right = -9999;
605  int next_left;
606  TBOX out_box;
607  float aspect_ratio;
608 
609  if (tessedit_lower_flip_hyphen <= 1) {
610  return;
611  }
612 
613  auto num_blobs = word_res->rebuild_word->NumBlobs();
614  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
615  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
616  TBLOB *blob = word_res->rebuild_word->blobs[i];
617  out_box = blob->bounding_box();
618  if (i + 1 == num_blobs) {
619  next_left = 9999;
620  } else {
621  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
622  }
623  // Don't touch small or touching blobs - it is too dangerous.
624  if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
625  (out_box.right() < next_left)) {
626  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
627  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
628  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
629  word_res->uch_set->contains_unichar_id(unichar_dash) &&
630  word_res->uch_set->get_enabled(unichar_dash)) {
631  /* Certain HYPHEN */
632  best_choice->set_unichar_id(unichar_dash, i);
633  if (word_res->reject_map[i].rejected()) {
634  word_res->reject_map[i].setrej_hyphen_accept();
635  }
636  }
637  if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
638  // Suspected HYPHEN
639  word_res->reject_map[i].setrej_hyphen();
640  }
641  } else if (best_choice->unichar_id(i) == unichar_dash) {
642  if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
643  word_res->reject_map[i].setrej_hyphen_accept();
644  }
645  // Certain HYPHEN
646 
647  if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
648  // Suspected HYPHEN
649  word_res->reject_map[i].setrej_hyphen();
650  }
651  }
652  }
653  prev_right = out_box.right();
654  }
655 }
656 
657 // Note: After running this function word_res->ratings
658 // might not contain the right BLOB_CHOICE corresponding to each character
659 // in word_res->best_choice.
// Resolves digit-zero / capital-O confusions in the best choice of word_res
// from the characters around each '0' or 'O'.
//
// Returns without changing anything when tessedit_flip_0O is false, when any
// upper-case or digit character's blob box fails the top/bottom test below,
// or when '0' and 'O' are not both present and enabled in the unicharset.
// Otherwise each '0'/'O' from index 1 onward is tested against the patterns
// named in the inline comments (A0A, A00A, AA0, 9O9, 9OOO, 9OO, 9O,
// 9[.,]OOO) and rewritten in place with set_unichar_id(). Several patterns
// advance the loop index i themselves, so the order of the tests matters.
660 void Tesseract::flip_0O(WERD_RES *word_res) {
661  WERD_CHOICE *best_choice = word_res->best_choice;
662  TBOX out_box;
663 
664  if (!tessedit_flip_0O) {
665  return;
666  }
667 
// Pre-scan: bail out if any upper-case/digit blob has its top below
// kBlnBaselineOffset + kBlnXHeight or its bottom above
// kBlnBaselineOffset + kBlnXHeight / 4.
668  auto num_blobs = word_res->rebuild_word->NumBlobs();
669  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
670  TBLOB *blob = word_res->rebuild_word->blobs[i];
671  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
672  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
673  out_box = blob->bounding_box();
674  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
675  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
676  return; // Beware words with sub/superscripts
677  }
678  }
679  }
680  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
681  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
682  if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
683  unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
684  return; // 0 or O are not present/enabled in unicharset
685  }
// Starts at 1 because every pattern looks at the character at i - 1.
686  for (unsigned i = 1; i < best_choice->length(); ++i) {
687  if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
688  /* A0A */
689  if ((i + 1) < best_choice->length() &&
690  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
691  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
692  best_choice->set_unichar_id(unichar_O, i);
693  }
694  /* A00A */
695  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
696  (i + 1) < best_choice->length() &&
697  (best_choice->unichar_id(i + 1) == unichar_0 ||
698  best_choice->unichar_id(i + 1) == unichar_O) &&
699  (i + 2) < best_choice->length() &&
700  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
701  best_choice->set_unichar_id(unichar_O, i);
702  i++;
703  }
704  /* AA0<non digit or end of word> */
705  if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
707  (((i + 1) < best_choice->length() &&
708  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
709  !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
710  !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
711  (i == best_choice->length() - 1))) {
712  best_choice->set_unichar_id(unichar_O, i);
713  }
714  /* 9O9 */
715  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
716  (i + 1) < best_choice->length() &&
717  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
718  best_choice->set_unichar_id(unichar_0, i);
719  }
720  /* 9OOO */
721  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
722  (i + 2) < best_choice->length() &&
723  (best_choice->unichar_id(i + 1) == unichar_0 ||
724  best_choice->unichar_id(i + 1) == unichar_O) &&
725  (best_choice->unichar_id(i + 2) == unichar_0 ||
726  best_choice->unichar_id(i + 2) == unichar_O)) {
727  best_choice->set_unichar_id(unichar_0, i);
728  best_choice->set_unichar_id(unichar_0, i + 1);
729  best_choice->set_unichar_id(unichar_0, i + 2);
730  i += 2;
731  }
732  /* 9OO<non upper> */
733  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
734  (i + 2) < best_choice->length() &&
735  (best_choice->unichar_id(i + 1) == unichar_0 ||
736  best_choice->unichar_id(i + 1) == unichar_O) &&
737  !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
738  best_choice->set_unichar_id(unichar_0, i);
739  best_choice->set_unichar_id(unichar_0, i + 1);
740  i++;
741  }
742  /* 9O<non upper> */
743  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
744  (i + 1) < best_choice->length() &&
745  !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
746  best_choice->set_unichar_id(unichar_0, i);
747  }
748  /* 9[.,]OOO.. */
749  if ((i > 1) &&
750  (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
751  word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
752  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
753  best_choice->unichar_id(i - 2) == unichar_O)) {
754  if (best_choice->unichar_id(i - 2) == unichar_O) {
755  best_choice->set_unichar_id(unichar_0, i - 2);
756  }
// Convert the whole run of 0/O after the separator, then step back one so the
// enclosing loop's ++i lands on the first character after the run.
757  while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
758  best_choice->unichar_id(i) == unichar_0)) {
759  best_choice->set_unichar_id(unichar_0, i);
760  i++;
761  }
762  i--;
763  }
764  }
765  }
766 }
767 
768 bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
769  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
770 }
771 
772 bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
773  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
774 }
775 } // namespace tesseract
776 
777 #endif // def DISABLED_LEGACY_ENGINE
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:59
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:227
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kBlnXHeight
Definition: normalis.h:33
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:208
int UNICHAR_ID
Definition: unichar.h:36
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ NUMBER_PERM
Definition: ratngs.h:238
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ DOC_DAWG_PERM
Definition: ratngs.h:241
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
const int kBlnBaselineOffset
Definition: normalis.h:34
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:182
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:457
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:260
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:470
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:768
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:483
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:513
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:375
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1704
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:287
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593
void set_done(WERD_RES *word, int16_t pass)
Definition: reject.cpp:62
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:772
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1811
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:602
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:195
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:556
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:496
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:565
void flip_0O(WERD_RES *word)
Definition: reject.cpp:660
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96
TBOX bounding_box() const
Definition: blobs.cpp:466
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84
unsigned length() const
Definition: boxword.h:81
float y_scale() const
Definition: normalis.h:262
float x_scale() const
Definition: normalis.h:259
tesseract::Tesseract * tesseract
Definition: pageres.h:278
WERD_CHOICE * best_choice
Definition: pageres.h:239
const UNICHARSET * uch_set
Definition: pageres.h:201
tesseract::BoxWord * box_word
Definition: pageres.h:270
TWERD * rebuild_word
Definition: pageres.h:264
float certainty() const
Definition: ratngs.h:311
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
uint8_t permuter() const
Definition: ratngs.h:327
bool dangerous_ambig_found() const
Definition: ratngs.h:344
const std::string & unichar_lengths() const
Definition: ratngs.h:529
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
float rating() const
Definition: ratngs.h:308
std::string & unichar_string()
Definition: ratngs.h:515
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:139
int16_t reject_count() const
Definition: rejctmap.h:339
void rej_word_contains_blanks()
Definition: rejctmap.cpp:147
void rej_word_small_xht()
Definition: rejctmap.cpp:127
uint16_t length() const
Definition: rejctmap.h:333
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
void rej_word_bad_permuter()
Definition: rejctmap.cpp:155
void rej_word_mostly_rej()
Definition: rejctmap.cpp:179
TBOX bounding_box() const
Definition: werd.cpp:155
UNICHARSET unicharset
Definition: ccutil.h:61
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:912
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86