tesseract  5.0.0
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1994, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "docqual.h"
20 #include <cctype>
21 #include "reject.h"
22 #include "tesseractclass.h"
23 #include "tessvars.h"
24 
25 namespace tesseract {
26 
// Adds one to match_count each time it is invoked.
// The second (blob index) argument is accepted but not used.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}
30 
31 static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
32  int index) {
33  if (word->reject_map[index].accepted()) {
34  ++accepted_match_count;
35  }
36  ++match_count;
37 }
38 
39 static void acceptIfGoodQuality(WERD_RES *word, int index) {
40  if (word->reject_map[index].accept_if_good_quality()) {
41  word->reject_map[index].setrej_quality_accept();
42  }
43 }
44 
45 /*************************************************************************
46  * word_blob_quality()
47  * How many blobs in the box_word are identical to those of the inword?
48  * ASSUME blobs in both initial word and box_word are in ascending order of
49  * left hand blob edge.
50  *************************************************************************/
52  int16_t match_count = 0;
53  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54  !word->rebuild_word->blobs.empty()) {
55  using namespace std::placeholders; // for _1
57  std::bind(countMatchingBlobs, match_count, _1));
58  }
59  return match_count;
60 }
61 
63  int16_t i = 0;
64  int16_t err_count = 0;
65 
66  if (word->rebuild_word != nullptr) {
67  for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
68  TBLOB *blob = word->rebuild_word->blobs[b];
69  err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
70  i++;
71  }
72  }
73  return err_count;
74 }
75 
76 /*************************************************************************
77  * word_char_quality()
78  * Combination of blob quality and outline quality - how many good chars are
79  * there? - i.e. chars which pass the blob AND outline tests.
80  *************************************************************************/
81 void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
82  int16_t *accepted_match_count) {
83  *match_count = 0;
84  *accepted_match_count = 0;
85  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
86  !word->rebuild_word->blobs.empty()) {
87  using namespace std::placeholders; // for _1
89  *word->rebuild_word,
90  std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
91  }
92 }
93 
94 /*************************************************************************
95  * unrej_good_chs()
96  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
97  *************************************************************************/
99  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
100  word->rebuild_word->blobs.empty()) {
101  using namespace std::placeholders; // for _1
103  std::bind(acceptIfGoodQuality, word, _1));
104  }
105 }
106 
107 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
108  int expected_outline_count;
109 
110  if (outlines_odd.contains(c)) {
111  return 0; // Don't use this char
112  } else if (outlines_2.contains(c)) {
113  expected_outline_count = 2;
114  } else {
115  expected_outline_count = 1;
116  }
117  return abs(outline_count - expected_outline_count);
118 }
119 
120 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
121  if ((tessedit_good_quality_unrej && good_quality_doc)) {
122  unrej_good_quality_words(page_res_it);
123  }
124  doc_and_block_rejection(page_res_it, good_quality_doc);
125  if (unlv_tilde_crunching) {
126  tilde_crunch(page_res_it);
127  tilde_delete(page_res_it);
128  }
129 }
130 
131 /*************************************************************************
132  * unrej_good_quality_words()
133  * Accept potential rejects in words which pass the following checks:
134  * - Contains a potential reject
135  * - Word looks like a sensible alpha word.
136  * - Word segmentation is the same as the original image
137  * - All characters have the expected number of outlines
138  * NOTE - the rejection counts are recalculated after unrejection
139  * - CAN'T do it in a single pass without a bit of fiddling
140  * - keep it simple but inefficient
141  *************************************************************************/
142 void Tesseract::unrej_good_quality_words( // unreject potential
143  PAGE_RES_IT &page_res_it) {
144  WERD_RES *word;
145  ROW_RES *current_row;
146  BLOCK_RES *current_block;
147  int i;
148 
149  page_res_it.restart_page();
150  while (page_res_it.word() != nullptr) {
151  check_debug_pt(page_res_it.word(), 100);
152  if (bland_unrej) {
153  word = page_res_it.word();
154  for (i = 0; i < word->reject_map.length(); i++) {
155  if (word->reject_map[i].accept_if_good_quality()) {
156  word->reject_map[i].setrej_quality_accept();
157  }
158  }
159  page_res_it.forward();
160  } else if ((page_res_it.row()->char_count > 0) &&
161  ((page_res_it.row()->rej_count /
162  static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
163  word = page_res_it.word();
165  (tessedit_unrej_any_wd ||
166  acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
167  word->best_choice->unichar_lengths().c_str()) !=
168  AC_UNACCEPTABLE)) {
169  unrej_good_chs(word);
170  }
171  page_res_it.forward();
172  } else {
173  // Skip to end of dodgy row.
174  current_row = page_res_it.row();
175  while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
176  page_res_it.forward();
177  }
178  }
179  check_debug_pt(page_res_it.word(), 110);
180  }
181  page_res_it.restart_page();
182  page_res_it.page_res->char_count = 0;
183  page_res_it.page_res->rej_count = 0;
184  current_block = nullptr;
185  current_row = nullptr;
186  while (page_res_it.word() != nullptr) {
187  if (current_block != page_res_it.block()) {
188  current_block = page_res_it.block();
189  current_block->char_count = 0;
190  current_block->rej_count = 0;
191  }
192  if (current_row != page_res_it.row()) {
193  current_row = page_res_it.row();
194  current_row->char_count = 0;
195  current_row->rej_count = 0;
196  current_row->whole_word_rej_count = 0;
197  }
198  page_res_it.rej_stat_word();
199  page_res_it.forward();
200  }
201 }
202 
203 /*************************************************************************
204  * doc_and_block_rejection()
205  *
206  * If the page has too many rejects - reject all of it.
207  * If any block has too many rejects - reject all words in the block
208  *************************************************************************/
209 
210 void Tesseract::doc_and_block_rejection( // reject big chunks
211  PAGE_RES_IT &page_res_it, bool good_quality_doc) {
212  int16_t block_no = 0;
213  int16_t row_no = 0;
214  BLOCK_RES *current_block;
215  ROW_RES *current_row;
216 
217  bool rej_word;
218  bool prev_word_rejected;
219  int16_t char_quality = 0;
220  int16_t accepted_char_quality;
221 
222  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
223  tessedit_reject_doc_percent) {
224  reject_whole_page(page_res_it);
225  if (tessedit_debug_doc_rejection) {
226  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
227  page_res_it.page_res->rej_count);
228  }
229  } else {
230  if (tessedit_debug_doc_rejection) {
231  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
232  page_res_it.page_res->rej_count);
233  }
234 
235  /* Walk blocks testing for block rejection */
236 
237  page_res_it.restart_page();
238  WERD_RES *word;
239  while ((word = page_res_it.word()) != nullptr) {
240  current_block = page_res_it.block();
241  block_no = current_block->block->pdblk.index();
242  if (current_block->char_count > 0 &&
243  (current_block->rej_count * 100.0 / current_block->char_count) >
244  tessedit_reject_block_percent) {
245  if (tessedit_debug_block_rejection) {
246  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
247  current_block->char_count, current_block->rej_count);
248  }
249  prev_word_rejected = false;
250  while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
251  if (tessedit_preserve_blk_rej_perfect_wds) {
252  rej_word = word->reject_map.reject_count() > 0 ||
253  word->reject_map.length() < tessedit_preserve_min_wd_len;
254  if (rej_word && tessedit_dont_blkrej_good_wds &&
255  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
256  acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
257  word->best_choice->unichar_lengths().c_str()) !=
258  AC_UNACCEPTABLE) {
259  word_char_quality(word, &char_quality, &accepted_char_quality);
260  rej_word = char_quality != word->reject_map.length();
261  }
262  } else {
263  rej_word = true;
264  }
265  if (rej_word) {
266  /*
267  Reject spacing if both current and prev words are rejected.
268  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
269  generated more space errors.
270 */
271  if (tessedit_use_reject_spaces && prev_word_rejected &&
272  page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
273  word->reject_spaces = true;
274  }
276  }
277  prev_word_rejected = rej_word;
278  page_res_it.forward();
279  }
280  } else {
281  if (tessedit_debug_block_rejection) {
282  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
283  page_res_it.block()->char_count, page_res_it.block()->rej_count);
284  }
285 
286  /* Walk rows in block testing for row rejection */
287  row_no = 0;
288  while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
289  current_row = page_res_it.row();
290  row_no++;
291  /* Reject whole row if:
292  fraction of chars on row which are rejected exceed a limit AND
293  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
294  limit
295 */
296  if (current_row->char_count > 0 &&
297  (current_row->rej_count * 100.0 / current_row->char_count) >
298  tessedit_reject_row_percent &&
299  (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
300  tessedit_whole_wd_rej_row_percent) {
301  if (tessedit_debug_block_rejection) {
302  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
303  current_row->char_count, current_row->rej_count);
304  }
305  prev_word_rejected = false;
306  while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
307  /* Preserve words on good docs unless they are mostly rejected*/
308  if (!tessedit_row_rej_good_docs && good_quality_doc) {
309  rej_word = word->reject_map.reject_count() /
310  static_cast<float>(word->reject_map.length()) >
311  tessedit_good_doc_still_rowrej_wd;
312  } else if (tessedit_preserve_row_rej_perfect_wds) {
313  /* Preserve perfect words anyway */
314  rej_word = word->reject_map.reject_count() > 0 ||
315  word->reject_map.length() < tessedit_preserve_min_wd_len;
316  if (rej_word && tessedit_dont_rowrej_good_wds &&
317  word->reject_map.length() >= tessedit_preserve_min_wd_len &&
319  *word->uch_set, word->best_choice->unichar_string().c_str(),
320  word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
321  word_char_quality(word, &char_quality, &accepted_char_quality);
322  rej_word = char_quality != word->reject_map.length();
323  }
324  } else {
325  rej_word = true;
326  }
327  if (rej_word) {
328  /*
329  Reject spacing if both current and prev words are rejected.
330  NOTE - this is NOT restricted to FUZZY spaces. - When tried
331  this generated more space errors.
332 */
333  if (tessedit_use_reject_spaces && prev_word_rejected &&
334  page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
335  word->reject_spaces = true;
336  }
338  }
339  prev_word_rejected = rej_word;
340  page_res_it.forward();
341  }
342  } else {
343  if (tessedit_debug_block_rejection) {
344  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
345  current_row->char_count, current_row->rej_count);
346  }
347  while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
348  page_res_it.forward();
349  }
350  }
351  }
352  }
353  }
354  }
355 }
356 
357 /*************************************************************************
358  * reject_whole_page()
359  * Don't believe any of it - set the reject map to 00..00 in all words
360  *
361  *************************************************************************/
362 
363 void reject_whole_page(PAGE_RES_IT &page_res_it) {
364  page_res_it.restart_page();
365  while (page_res_it.word() != nullptr) {
366  page_res_it.word()->reject_map.rej_word_doc_rej();
367  page_res_it.forward();
368  }
369  // whole page is rejected
370  page_res_it.page_res->rejected = true;
371 }
372 
374  WERD_RES *word;
375  GARBAGE_LEVEL garbage_level;
376  PAGE_RES_IT copy_it;
377  bool prev_potential_marked = false;
378  bool found_terrible_word = false;
379  bool ok_dict_word;
380 
381  page_res_it.restart_page();
382  while (page_res_it.word() != nullptr) {
383  POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
384  if (pb != nullptr && !pb->IsText()) {
385  page_res_it.forward();
386  continue;
387  }
388  word = page_res_it.word();
389 
390  if (crunch_early_convert_bad_unlv_chs) {
391  convert_bad_unlv_chs(word);
392  }
393 
394  if (crunch_early_merge_tess_fails) {
395  word->merge_tess_fails();
396  }
397 
398  if (word->reject_map.accept_count() != 0) {
399  found_terrible_word = false;
400  // Forget earlier potential crunches
401  prev_potential_marked = false;
402  } else {
403  ok_dict_word = safe_dict_word(word);
404  garbage_level = garbage_word(word, ok_dict_word);
405 
406  if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
407  if (crunch_debug > 0) {
408  tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
409  }
411  if (prev_potential_marked) {
412  while (copy_it.word() != word) {
413  if (crunch_debug > 0) {
414  tprintf("P1 CRUNCHING: \"%s\"\n",
415  copy_it.word()->best_choice->unichar_string().c_str());
416  }
417  copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
418  copy_it.forward();
419  }
420  prev_potential_marked = false;
421  }
422  found_terrible_word = true;
423  } else if ((garbage_level != G_NEVER_CRUNCH) &&
424  (potential_word_crunch(word, garbage_level, ok_dict_word))) {
425  if (found_terrible_word) {
426  if (crunch_debug > 0) {
427  tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
428  }
430  } else if (!prev_potential_marked) {
431  copy_it = page_res_it;
432  prev_potential_marked = true;
433  if (crunch_debug > 1) {
434  tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
435  }
436  }
437  } else {
438  found_terrible_word = false;
439  // Forget earlier potential crunches
440  prev_potential_marked = false;
441  if (crunch_debug > 2) {
442  tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
443  }
444  }
445  }
446  page_res_it.forward();
447  }
448 }
449 
451  float rating_per_ch;
452  int adjusted_len;
453  int crunch_mode = 0;
454 
455  if (word->best_choice->unichar_string().empty() ||
456  (strspn(word->best_choice->unichar_string().c_str(), " ") ==
457  word->best_choice->unichar_string().size())) {
458  crunch_mode = 1;
459  } else {
460  adjusted_len = word->reject_map.length();
461  if (adjusted_len > crunch_rating_max) {
462  adjusted_len = crunch_rating_max;
463  }
464  rating_per_ch = word->best_choice->rating() / adjusted_len;
465 
466  if (rating_per_ch > crunch_terrible_rating) {
467  crunch_mode = 2;
468  } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
469  crunch_mode = 3;
470  } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
471  (garbage_level != G_OK)) {
472  crunch_mode = 4;
473  } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
474  crunch_mode = 5;
475  }
476  }
477  if (crunch_mode > 0) {
478  if (crunch_debug > 2) {
479  tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
480  word->best_choice->unichar_string().c_str());
481  }
482  return true;
483  } else {
484  return false;
485  }
486 }
487 
489  bool ok_dict_word) {
490  float rating_per_ch;
491  int adjusted_len;
492  const char *str = word->best_choice->unichar_string().c_str();
493  const char *lengths = word->best_choice->unichar_lengths().c_str();
494  bool word_crunchable;
495  int poor_indicator_count = 0;
496 
497  word_crunchable =
498  !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
499  (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);
500 
501  adjusted_len = word->reject_map.length();
502  if (adjusted_len > 10) {
503  adjusted_len = 10;
504  }
505  rating_per_ch = word->best_choice->rating() / adjusted_len;
506 
507  if (rating_per_ch > crunch_pot_poor_rate) {
508  if (crunch_debug > 2) {
509  tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
510  }
511  poor_indicator_count++;
512  }
513 
514  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
515  if (crunch_debug > 2) {
516  tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
517  }
518  poor_indicator_count++;
519  }
520 
521  if (garbage_level != G_OK) {
522  if (crunch_debug > 2) {
523  tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
524  }
525  poor_indicator_count++;
526  }
527  return poor_indicator_count >= crunch_pot_indicators;
528 }
529 
531  WERD_RES *word;
532  PAGE_RES_IT copy_it;
533  bool deleting_from_bol = false;
534  bool marked_delete_point = false;
535  int16_t debug_delete_mode;
536  CRUNCH_MODE delete_mode;
537  int16_t x_debug_delete_mode;
538  CRUNCH_MODE x_delete_mode;
539 
540  page_res_it.restart_page();
541  while (page_res_it.word() != nullptr) {
542  word = page_res_it.word();
543 
544  delete_mode = word_deletable(word, debug_delete_mode);
545  if (delete_mode != CR_NONE) {
546  if (word->word->flag(W_BOL) || deleting_from_bol) {
547  if (crunch_debug > 0) {
548  tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
549  word->best_choice->unichar_string().c_str());
550  }
551  word->unlv_crunch_mode = delete_mode;
552  deleting_from_bol = true;
553  } else if (word->word->flag(W_EOL)) {
554  if (marked_delete_point) {
555  while (copy_it.word() != word) {
556  x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
557  if (crunch_debug > 0) {
558  tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
559  copy_it.word()->best_choice->unichar_string().c_str());
560  }
561  copy_it.word()->unlv_crunch_mode = x_delete_mode;
562  copy_it.forward();
563  }
564  }
565  if (crunch_debug > 0) {
566  tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
567  word->best_choice->unichar_string().c_str());
568  }
569  word->unlv_crunch_mode = delete_mode;
570  deleting_from_bol = false;
571  marked_delete_point = false;
572  } else {
573  if (!marked_delete_point) {
574  copy_it = page_res_it;
575  marked_delete_point = true;
576  }
577  }
578  } else {
579  deleting_from_bol = false;
580  // Forget earlier potential crunches
581  marked_delete_point = false;
582  }
583  /*
584  The following step has been left till now as the tess fails are used to
585  determine if the word is deletable.
586 */
587  if (!crunch_early_merge_tess_fails) {
588  word->merge_tess_fails();
589  }
590  page_res_it.forward();
591  }
592 }
593 
595  int i;
596  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
597  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
598  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
599  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
600  for (i = 0; i < word_res->reject_map.length(); ++i) {
601  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
602  word_res->best_choice->set_unichar_id(unichar_dash, i);
603  if (word_res->reject_map[i].accepted()) {
604  word_res->reject_map[i].setrej_unlv_rej();
605  }
606  }
607  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
608  word_res->best_choice->set_unichar_id(unichar_space, i);
609  if (word_res->reject_map[i].accepted()) {
610  word_res->reject_map[i].setrej_unlv_rej();
611  }
612  }
613  }
614 }
615 
616 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
617  enum STATES {
618  JUNK,
619  FIRST_UPPER,
620  FIRST_LOWER,
621  FIRST_NUM,
622  SUBSEQUENT_UPPER,
623  SUBSEQUENT_LOWER,
624  SUBSEQUENT_NUM
625  };
626  const char *str = word->best_choice->unichar_string().c_str();
627  const char *lengths = word->best_choice->unichar_lengths().c_str();
628  STATES state = JUNK;
629  int len = 0;
630  int isolated_digits = 0;
631  int isolated_alphas = 0;
632  int bad_char_count = 0;
633  int tess_rejs = 0;
634  int dodgy_chars = 0;
635  int ok_chars;
636  UNICHAR_ID last_char = -1;
637  int alpha_repetition_count = 0;
638  int longest_alpha_repetition_count = 0;
639  int longest_lower_run_len = 0;
640  int lower_string_count = 0;
641  int longest_upper_run_len = 0;
642  int upper_string_count = 0;
643  int total_alpha_count = 0;
644  int total_digit_count = 0;
645 
646  for (; *str != '\0'; str += *(lengths++)) {
647  len++;
648  if (word->uch_set->get_isupper(str, *lengths)) {
649  total_alpha_count++;
650  switch (state) {
651  case SUBSEQUENT_UPPER:
652  case FIRST_UPPER:
653  state = SUBSEQUENT_UPPER;
654  upper_string_count++;
655  if (longest_upper_run_len < upper_string_count) {
656  longest_upper_run_len = upper_string_count;
657  }
658  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
659  alpha_repetition_count++;
660  if (longest_alpha_repetition_count < alpha_repetition_count) {
661  longest_alpha_repetition_count = alpha_repetition_count;
662  }
663  } else {
664  last_char = word->uch_set->unichar_to_id(str, *lengths);
665  alpha_repetition_count = 1;
666  }
667  break;
668  case FIRST_NUM:
669  isolated_digits++;
670  // Fall through.
671  default:
672  state = FIRST_UPPER;
673  last_char = word->uch_set->unichar_to_id(str, *lengths);
674  alpha_repetition_count = 1;
675  upper_string_count = 1;
676  break;
677  }
678  } else if (word->uch_set->get_islower(str, *lengths)) {
679  total_alpha_count++;
680  switch (state) {
681  case SUBSEQUENT_LOWER:
682  case FIRST_LOWER:
683  state = SUBSEQUENT_LOWER;
684  lower_string_count++;
685  if (longest_lower_run_len < lower_string_count) {
686  longest_lower_run_len = lower_string_count;
687  }
688  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
689  alpha_repetition_count++;
690  if (longest_alpha_repetition_count < alpha_repetition_count) {
691  longest_alpha_repetition_count = alpha_repetition_count;
692  }
693  } else {
694  last_char = word->uch_set->unichar_to_id(str, *lengths);
695  alpha_repetition_count = 1;
696  }
697  break;
698  case FIRST_NUM:
699  isolated_digits++;
700  // Fall through.
701  default:
702  state = FIRST_LOWER;
703  last_char = word->uch_set->unichar_to_id(str, *lengths);
704  alpha_repetition_count = 1;
705  lower_string_count = 1;
706  break;
707  }
708  } else if (word->uch_set->get_isdigit(str, *lengths)) {
709  total_digit_count++;
710  switch (state) {
711  case FIRST_NUM:
712  state = SUBSEQUENT_NUM;
713  case SUBSEQUENT_NUM:
714  break;
715  case FIRST_UPPER:
716  case FIRST_LOWER:
717  isolated_alphas++;
718  // Fall through.
719  default:
720  state = FIRST_NUM;
721  break;
722  }
723  } else {
724  if (*lengths == 1 && *str == ' ') {
725  tess_rejs++;
726  } else {
727  bad_char_count++;
728  }
729  switch (state) {
730  case FIRST_NUM:
731  isolated_digits++;
732  break;
733  case FIRST_UPPER:
734  case FIRST_LOWER:
735  isolated_alphas++;
736  default:
737  break;
738  }
739  state = JUNK;
740  }
741  }
742 
743  switch (state) {
744  case FIRST_NUM:
745  isolated_digits++;
746  break;
747  case FIRST_UPPER:
748  case FIRST_LOWER:
749  isolated_alphas++;
750  default:
751  break;
752  }
753 
754  if (crunch_include_numerals) {
755  total_alpha_count += total_digit_count - isolated_digits;
756  }
757 
758  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
759  longest_alpha_repetition_count < crunch_long_repetitions) {
760  if ((crunch_accept_ok &&
761  acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
762  longest_lower_run_len > crunch_leave_lc_strings ||
763  longest_upper_run_len > crunch_leave_uc_strings) {
764  return G_NEVER_CRUNCH;
765  }
766  }
767  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
768  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
769  word->best_choice->permuter() == FREQ_DAWG_PERM ||
770  word->best_choice->permuter() == USER_DAWG_PERM ||
771  word->best_choice->permuter() == NUMBER_PERM ||
772  acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
773  return G_OK;
774  }
775 
776  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
777 
778  if (crunch_debug > 3) {
779  tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
780  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
781  isolated_digits, isolated_alphas, tess_rejs);
782  }
783  if (bad_char_count == 0 && tess_rejs == 0 &&
784  (len > isolated_digits + isolated_alphas || len <= 2)) {
785  return G_OK;
786  }
787 
788  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
789  return G_TERRIBLE;
790  }
791 
792  if (len > 4) {
793  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
794  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
795  return G_DODGY;
796  } else {
797  return G_OK;
798  }
799  } else {
800  dodgy_chars = 2 * tess_rejs + bad_char_count;
801  if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
802  return G_DODGY;
803  } else {
804  return G_OK;
805  }
806  }
807 }
808 
809 /*************************************************************************
810  * word_deletable()
811  * DELETE WERDS AT ENDS OF ROWS IF
812  * Word is crunched &&
813  * ( string length = 0 OR
814  * > 50% of chars are "|" (before merging) OR
815  * certainty < -10 OR
816  * rating /char > 60 OR
817  * TOP of word is more than 0.5 xht BELOW baseline OR
818  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
819  * length of word < 3xht OR
820  * height of word < 0.7 xht OR
821  * height of word > 3.0 xht OR
822  * >75% of the outline BBs have longest dimension < 0.5xht
823  *************************************************************************/
824 
825 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
826  int word_len = word->reject_map.length();
827  float rating_per_ch;
828  TBOX box; // BB of word
829 
830  if (word->unlv_crunch_mode == CR_NONE) {
831  delete_mode = 0;
832  return CR_NONE;
833  }
834 
835  if (word_len == 0) {
836  delete_mode = 1;
837  return CR_DELETE;
838  }
839 
840  if (word->rebuild_word != nullptr) {
841  // Cube leaves rebuild_word nullptr.
842  box = word->rebuild_word->bounding_box();
843  if (box.height() < crunch_del_min_ht * kBlnXHeight) {
844  delete_mode = 4;
845  return CR_DELETE;
846  }
847 
848  if (noise_outlines(word->rebuild_word)) {
849  delete_mode = 5;
850  return CR_DELETE;
851  }
852  }
853 
854  if ((failure_count(word) * 1.5) > word_len) {
855  delete_mode = 2;
856  return CR_LOOSE_SPACE;
857  }
858 
859  if (word->best_choice->certainty() < crunch_del_cert) {
860  delete_mode = 7;
861  return CR_LOOSE_SPACE;
862  }
863 
864  rating_per_ch = word->best_choice->rating() / word_len;
865 
866  if (rating_per_ch > crunch_del_rating) {
867  delete_mode = 8;
868  return CR_LOOSE_SPACE;
869  }
870 
871  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
872  delete_mode = 9;
873  return CR_LOOSE_SPACE;
874  }
875 
876  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
877  delete_mode = 10;
878  return CR_LOOSE_SPACE;
879  }
880 
881  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
882  delete_mode = 11;
883  return CR_LOOSE_SPACE;
884  }
885 
886  if (box.width() < crunch_del_min_width * kBlnXHeight) {
887  delete_mode = 3;
888  return CR_LOOSE_SPACE;
889  }
890 
891  delete_mode = 0;
892  return CR_NONE;
893 }
894 
896  const char *str = word->best_choice->unichar_string().c_str();
897  int tess_rejs = 0;
898 
899  for (; *str != '\0'; str++) {
900  if (*str == ' ') {
901  tess_rejs++;
902  }
903  }
904  return tess_rejs;
905 }
906 
908  TBOX box; // BB of outline
909  int16_t outline_count = 0;
910  int16_t small_outline_count = 0;
911  int16_t max_dimension;
912  float small_limit = kBlnXHeight * crunch_small_outlines_size;
913 
914  for (unsigned b = 0; b < word->NumBlobs(); ++b) {
915  TBLOB *blob = word->blobs[b];
916  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
917  outline_count++;
918  box = ol->bounding_box();
919  if (box.height() > box.width()) {
920  max_dimension = box.height();
921  } else {
922  max_dimension = box.width();
923  }
924  if (max_dimension < small_limit) {
925  small_outline_count++;
926  }
927  }
928  }
929  return small_outline_count >= outline_count;
930 }
931 
932 } // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kBlnXHeight
Definition: normalis.h:33
int UNICHAR_ID
Definition: unichar.h:36
GARBAGE_LEVEL
Definition: docqual.h:30
@ G_TERRIBLE
Definition: docqual.h:30
@ G_NEVER_CRUNCH
Definition: docqual.h:30
@ G_OK
Definition: docqual.h:30
@ G_DODGY
Definition: docqual.h:30
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ NUMBER_PERM
Definition: ratngs.h:238
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:363
const int kBlnBaselineOffset
Definition: normalis.h:34
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:530
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:616
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:373
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:98
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:210
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1704
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:907
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:895
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:594
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1811
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:488
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:450
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:825
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:107
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:142
TESSLINE * next
Definition: blobs.h:288
TESSLINE * outlines
Definition: blobs.h:404
int NumOutlines() const
Definition: blobs.cpp:452
TBOX bounding_box() const
Definition: blobs.cpp:863
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
void ProcessMatchedBlobs(const TWERD &other, const std::function< void(int)> &cb) const
Definition: boxword.cpp:201
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int32_t rej_count
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:121
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
WERD_CHOICE * best_choice
Definition: pageres.h:239
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193
const UNICHARSET * uch_set
Definition: pageres.h:201
TWERD * rebuild_word
Definition: pageres.h:264
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * restart_page()
Definition: pageres.h:710
ROW_RES * prev_row() const
Definition: pageres.h:757
WERD_RES * forward()
Definition: pageres.h:743
ROW_RES * row() const
Definition: pageres.h:766
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
int index() const
Definition: pdblock.h:77
bool IsText() const
Definition: polyblk.h:52
float certainty() const
Definition: ratngs.h:311
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
uint8_t permuter() const
Definition: ratngs.h:327
const std::string & unichar_lengths() const
Definition: ratngs.h:529
float rating() const
Definition: ratngs.h:308
std::string & unichar_string()
Definition: ratngs.h:515
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension bottom() const
Definition: rect.h:75
int16_t reject_count() const
Definition: rejctmap.h:339
void rej_word_row_rej()
Definition: rejctmap.cpp:211
int16_t accept_count() const
Definition: rejctmap.cpp:72
uint16_t length() const
Definition: rejctmap.h:333
void rej_word_block_rej()
Definition: rejctmap.cpp:203
bool quality_recoverable_rejects() const
Definition: rejctmap.cpp:91
void rej_word_doc_rej()
Definition: rejctmap.cpp:195
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
uint8_t space() const
Definition: werd.h:100
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186