tesseract  5.0.0
tesseract::WERD_RES Class Reference

#include <pageres.h>

Inheritance diagram for tesseract::WERD_RES:
tesseract::ELIST_LINK

Public Member Functions

 WERD_RES ()=default
 
 WERD_RES (WERD *the_word)
 
 WERD_RES (const WERD_RES &source)
 
 ~WERD_RES ()
 
const char * BestUTF8 (unsigned blob_index, bool in_rtl_context) const
 
const char * RawUTF8 (unsigned blob_index) const
 
UNICHARSET::Direction SymbolDirection (unsigned blob_index) const
 
bool AnyRtlCharsInWord () const
 
bool AnyLtrCharsInWord () const
 
bool UnicharsInReadingOrder () const
 
void Clear ()
 
void ClearResults ()
 
void ClearWordChoices ()
 
void ClearRatings ()
 
WERD_RES & operator= (const WERD_RES &source)
 
void CopySimpleFields (const WERD_RES &source)
 
void InitForRetryRecognition (const WERD_RES &source)
 
bool SetupForRecognition (const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
 
void SetupBasicsFromChoppedWord (const UNICHARSET &unicharset_in)
 
void SetupFake (const UNICHARSET &uch)
 
void SetupWordScript (const UNICHARSET &unicharset_in)
 
void SetupBlamerBundle ()
 
void SetupBlobWidthsAndGaps ()
 
void InsertSeam (int blob_number, SEAM *seam)
 
bool AlternativeChoiceAdjustmentsWorseThan (float threshold) const
 
bool IsAmbiguous ()
 
bool StatesAllValid ()
 
void DebugWordChoices (bool debug, const char *word_to_debug)
 
void DebugTopChoice (const char *msg) const
 
void FilterWordChoices (int debug_level)
 
void ComputeAdaptionThresholds (float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
 
bool LogNewRawChoice (WERD_CHOICE *word_choice)
 
bool LogNewCookedChoice (int max_num_choices, bool debug, WERD_CHOICE *word_choice)
 
void PrintBestChoices () const
 
int GetBlobsWidth (int start_blob, int last_blob) const
 
int GetBlobsGap (unsigned blob_index) const
 
BLOB_CHOICE * GetBlobChoice (unsigned index) const
 
BLOB_CHOICE_LIST * GetBlobChoices (int index) const
 
void ConsumeWordResults (WERD_RES *word)
 
void ReplaceBestChoice (WERD_CHOICE *choice)
 
void RebuildBestState ()
 
void CloneChoppedToRebuild ()
 
void SetupBoxWord ()
 
void SetScriptPositions ()
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
void FakeClassifyWord (unsigned blob_count, BLOB_CHOICE **choices)
 
void FakeWordFromRatings (PermuterType permuter)
 
void BestChoiceToCorrectText ()
 
bool ConditionalBlobMerge (const std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, const std::function< bool(const TBOX &, const TBOX &)> &box_cb)
 
void MergeAdjacentBlobs (unsigned index)
 
UNICHAR_ID BothQuotes (UNICHAR_ID id1, UNICHAR_ID id2)
 
void fix_quotes ()
 
UNICHAR_ID BothHyphens (UNICHAR_ID id1, UNICHAR_ID id2)
 
bool HyphenBoxesOverlap (const TBOX &box1, const TBOX &box2)
 
void fix_hyphens ()
 
UNICHAR_ID BothSpaces (UNICHAR_ID id1, UNICHAR_ID id2)
 
void merge_tess_fails ()
 
void copy_on (WERD_RES *word_res)
 
bool PiecesAllNatural (int start, int count) const
 
- Public Member Functions inherited from tesseract::ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static WERD_RES * deep_copy (const WERD_RES *src)
 

Public Attributes

WERD * word = nullptr
 
tesseract::BoxWord * bln_boxes = nullptr
 
ROW * blob_row = nullptr
 
DENORM denorm
 
const UNICHARSET * uch_set = nullptr
 
TWERD * chopped_word = nullptr
 
std::vector< SEAM * > seam_array
 
std::vector< int > blob_widths
 
std::vector< int > blob_gaps
 
std::vector< std::vector< std::pair< const char *, float > > > timesteps
 
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
 
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
 
bool leading_space = false
 
int end = 0
 
MATRIX * ratings = nullptr
 
WERD_CHOICE * best_choice = nullptr
 
WERD_CHOICE * raw_choice = nullptr
 
WERD_CHOICE_LIST best_choices
 
BlamerBundle * blamer_bundle = nullptr
 
TWERD * rebuild_word = nullptr
 
tesseract::BoxWord * box_word = nullptr
 
tesseract::Tesseract * tesseract = nullptr
 
std::vector< int > best_state
 
std::vector< std::string > correct_text
 
WERD_CHOICE * ep_choice = nullptr
 
REJMAP reject_map
 
bool tess_failed = false
 
bool tess_accepted = false
 
bool tess_would_adapt = false
 
bool done = false
 
bool small_caps = false
 
bool odd_size = false
 
const FontInfo * fontinfo = nullptr
 
const FontInfo * fontinfo2 = nullptr
 
int8_t fontinfo_id_count = 0
 
int8_t fontinfo_id2_count = 0
 
bool guessed_x_ht = true
 
bool guessed_caps_ht = true
 
CRUNCH_MODE unlv_crunch_mode = CR_NONE
 
float x_height = 0.0f
 
float caps_height = 0.0f
 
float baseline_shift = 0.0f
 
float space_certainty = 0.0f
 
bool combination = false
 
bool part_of_combo = false
 
bool reject_spaces = false
 

Detailed Description

Definition at line 164 of file pageres.h.

Constructor & Destructor Documentation

◆ WERD_RES() [1/3]

tesseract::WERD_RES::WERD_RES ( )
default

◆ WERD_RES() [2/3]

tesseract::WERD_RES::WERD_RES ( WERD * the_word)
inline

Definition at line 343 of file pageres.h.

343  {
344  word = the_word;
345  }

◆ WERD_RES() [3/3]

tesseract::WERD_RES::WERD_RES ( const WERD_RES & source)
inline

Definition at line 348 of file pageres.h.

348  : ELIST_LINK(source) {
349  // combination is used in function Clear which is called from operator=.
350  combination = false;
351  *this = source; // see operator=
352  }

◆ ~WERD_RES()

tesseract::WERD_RES::~WERD_RES ( )

Definition at line 1124 of file pageres.cpp.

1124  {
1125  Clear();
1126 }

Member Function Documentation

◆ AlternativeChoiceAdjustmentsWorseThan()

bool tesseract::WERD_RES::AlternativeChoiceAdjustmentsWorseThan ( float  threshold) const

Definition at line 441 of file pageres.cpp.

441  {
442  // The choices are not changed by this iteration.
443  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
444  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
445  WERD_CHOICE *choice = wc_it.data();
446  if (choice->adjust_factor() <= threshold) {
447  return false;
448  }
449  }
450  return true;
451 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247

◆ AnyLtrCharsInWord()

bool tesseract::WERD_RES::AnyLtrCharsInWord ( ) const
inline

Definition at line 413 of file pageres.h.

413  {
414  if (uch_set == nullptr || best_choice == nullptr ||
415  best_choice->length() < 1) {
416  return false;
417  }
418  for (unsigned id = 0; id < best_choice->length(); id++) {
419  unsigned unichar_id = best_choice->unichar_id(id);
420  if (unichar_id >= uch_set->size()) {
421  continue; // Ignore illegal chars.
422  }
423  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
424  if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
426  return true;
427  }
428  }
429  return false;
430  }
WERD_CHOICE * best_choice
Definition: pageres.h:239
const UNICHARSET * uch_set
Definition: pageres.h:201
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
unsigned length() const
Definition: ratngs.h:283
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713
size_t size() const
Definition: unicharset.h:355

◆ AnyRtlCharsInWord()

bool tesseract::WERD_RES::AnyRtlCharsInWord ( ) const
inline

Definition at line 394 of file pageres.h.

394  {
395  if (uch_set == nullptr || best_choice == nullptr ||
396  best_choice->length() < 1) {
397  return false;
398  }
399  for (unsigned id = 0; id < best_choice->length(); id++) {
400  unsigned unichar_id = best_choice->unichar_id(id);
401  if (unichar_id >= uch_set->size()) {
402  continue; // Ignore illegal chars.
403  }
404  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
405  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
407  return true;
408  }
409  }
410  return false;
411  }

◆ BestChoiceToCorrectText()

void tesseract::WERD_RES::BestChoiceToCorrectText ( )

Definition at line 956 of file pageres.cpp.

956  {
957  correct_text.clear();
958  ASSERT_HOST(best_choice != nullptr);
959  for (unsigned i = 0; i < best_choice->length(); ++i) {
960  UNICHAR_ID choice_id = best_choice->unichar_id(i);
961  const char *blob_choice = uch_set->id_to_unichar(choice_id);
962  correct_text.emplace_back(blob_choice);
963  }
964 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
int UNICHAR_ID
Definition: unichar.h:36
std::vector< std::string > correct_text
Definition: pageres.h:287
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ BestUTF8()

const char* tesseract::WERD_RES::BestUTF8 ( unsigned  blob_index,
bool  in_rtl_context 
) const
inline

Definition at line 361 of file pageres.h.

361  {
362  if (best_choice == nullptr || blob_index >= best_choice->length()) {
363  return nullptr;
364  }
365  UNICHAR_ID id = best_choice->unichar_id(blob_index);
366  if (static_cast<unsigned>(id) >= uch_set->size()) {
367  return nullptr;
368  }
369  UNICHAR_ID mirrored = uch_set->get_mirror(id);
370  if (in_rtl_context && mirrored > 0) {
371  id = mirrored;
372  }
373  return uch_set->id_to_unichar_ext(id);
374  }
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287

◆ BothHyphens()

UNICHAR_ID tesseract::WERD_RES::BothHyphens ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
)

Definition at line 1059 of file pageres.cpp.

1059  {
1060  const char *ch = uch_set->id_to_unichar(id1);
1061  const char *next_ch = uch_set->id_to_unichar(id2);
1062  if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') &&
1063  (*next_ch == '-' || *next_ch == '~')) {
1064  return uch_set->unichar_to_id("-");
1065  }
1066  return INVALID_UNICHAR_ID;
1067 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

◆ BothQuotes()

UNICHAR_ID tesseract::WERD_RES::BothQuotes ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
)

Definition at line 1036 of file pageres.cpp.

1036  {
1037  const char *ch = uch_set->id_to_unichar(id1);
1038  const char *next_ch = uch_set->id_to_unichar(id2);
1039  if (is_simple_quote(ch, strlen(ch)) &&
1040  is_simple_quote(next_ch, strlen(next_ch))) {
1041  return uch_set->unichar_to_id("\"");
1042  }
1043  return INVALID_UNICHAR_ID;
1044 }

◆ BothSpaces()

UNICHAR_ID tesseract::WERD_RES::BothSpaces ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
)

Definition at line 1090 of file pageres.cpp.

1090  {
1091  if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) {
1092  return id1;
1093  } else {
1094  return INVALID_UNICHAR_ID;
1095  }
1096 }

◆ Clear()

void tesseract::WERD_RES::Clear ( )

Definition at line 1128 of file pageres.cpp.

1128  {
1129  if (combination) {
1130  delete word;
1131  }
1132  word = nullptr;
1133  delete blamer_bundle;
1134  blamer_bundle = nullptr;
1135  ClearResults();
1136 }
BlamerBundle * blamer_bundle
Definition: pageres.h:250

◆ ClearRatings()

void tesseract::WERD_RES::ClearRatings ( )

Definition at line 1175 of file pageres.cpp.

1175  {
1176  if (ratings != nullptr) {
1178  delete ratings;
1179  ratings = nullptr;
1180  }
1181 }
MATRIX * ratings
Definition: pageres.h:235

◆ ClearResults()

void tesseract::WERD_RES::ClearResults ( )

Definition at line 1138 of file pageres.cpp.

1138  {
1139  done = false;
1140  fontinfo = nullptr;
1141  fontinfo2 = nullptr;
1142  fontinfo_id_count = 0;
1143  fontinfo_id2_count = 0;
1144  delete bln_boxes;
1145  bln_boxes = nullptr;
1146  blob_row = nullptr;
1147  delete chopped_word;
1148  chopped_word = nullptr;
1149  delete rebuild_word;
1150  rebuild_word = nullptr;
1151  delete box_word;
1152  box_word = nullptr;
1153  best_state.clear();
1154  correct_text.clear();
1155  for (auto data : seam_array) {
1156  delete data;
1157  }
1158  seam_array.clear();
1159  blob_widths.clear();
1160  blob_gaps.clear();
1161  ClearRatings();
1162  ClearWordChoices();
1163  if (blamer_bundle != nullptr) {
1165  }
1166 }
const FontInfo * fontinfo2
Definition: pageres.h:308
int8_t fontinfo_id2_count
Definition: pageres.h:310
TWERD * chopped_word
Definition: pageres.h:210
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193
int8_t fontinfo_id_count
Definition: pageres.h:309
const FontInfo * fontinfo
Definition: pageres.h:307
std::vector< int > best_state
Definition: pageres.h:283
tesseract::BoxWord * box_word
Definition: pageres.h:270
std::vector< int > blob_widths
Definition: pageres.h:214
std::vector< int > blob_gaps
Definition: pageres.h:217
TWERD * rebuild_word
Definition: pageres.h:264
std::vector< SEAM * > seam_array
Definition: pageres.h:212

◆ ClearWordChoices()

void tesseract::WERD_RES::ClearWordChoices ( )

Definition at line 1167 of file pageres.cpp.

1167  {
1168  best_choice = nullptr;
1169  delete raw_choice;
1170  raw_choice = nullptr;
1171  best_choices.clear();
1172  delete ep_choice;
1173  ep_choice = nullptr;
1174 }
WERD_CHOICE * raw_choice
Definition: pageres.h:244
WERD_CHOICE * ep_choice
Definition: pageres.h:291

◆ CloneChoppedToRebuild()

void tesseract::WERD_RES::CloneChoppedToRebuild ( )

Definition at line 865 of file pageres.cpp.

865  {
866  delete rebuild_word;
867  rebuild_word = new TWERD(*chopped_word);
868  SetupBoxWord();
869  auto word_len = box_word->length();
870  best_state.reserve(word_len);
871  correct_text.reserve(word_len);
872  for (unsigned i = 0; i < word_len; ++i) {
873  best_state.push_back(1);
874  correct_text.emplace_back("");
875  }
876 }
unsigned length() const
Definition: boxword.h:81

◆ ComputeAdaptionThresholds()

void tesseract::WERD_RES::ComputeAdaptionThresholds ( float  certainty_scale,
float  min_rating,
float  max_rating,
float  rating_margin,
float *  thresholds 
)

Definition at line 570 of file pageres.cpp.

573  {
574  int chunk = 0;
575  int end_chunk = best_choice->state(0);
576  int end_raw_chunk = raw_choice->state(0);
577  int raw_blob = 0;
578  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
579  float avg_rating = 0.0f;
580  int num_error_chunks = 0;
581 
582  // For each chunk in best choice blob i, count non-matching raw results.
583  while (chunk < end_chunk) {
584  if (chunk >= end_raw_chunk) {
585  ++raw_blob;
586  end_raw_chunk += raw_choice->state(raw_blob);
587  }
588  if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
589  avg_rating += raw_choice->certainty(raw_blob);
590  ++num_error_chunks;
591  }
592  ++chunk;
593  }
594 
595  if (num_error_chunks > 0) {
596  avg_rating /= num_error_chunks;
597  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
598  } else {
599  *thresholds = max_rating;
600  }
601 
602  if (*thresholds > max_rating) {
603  *thresholds = max_rating;
604  }
605  if (*thresholds < min_rating) {
606  *thresholds = min_rating;
607  }
608  }
609 }
float certainty() const
Definition: ratngs.h:311
unsigned state(unsigned index) const
Definition: ratngs.h:299

◆ ConditionalBlobMerge()

bool tesseract::WERD_RES::ConditionalBlobMerge ( const std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &  class_cb,
const std::function< bool(const TBOX &, const TBOX &)> &  box_cb 
)

Definition at line 971 of file pageres.cpp.

973  {
974  ASSERT_HOST(best_choice->empty() || ratings != nullptr);
975  bool modified = false;
976  for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
977  UNICHAR_ID new_id =
978  class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
979  if (new_id != INVALID_UNICHAR_ID &&
980  (box_cb == nullptr ||
981  box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
982  // Raw choice should not be fixed.
983  best_choice->set_unichar_id(new_id, i);
984  modified = true;
985  MergeAdjacentBlobs(i);
986  const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
987  if (!coord.Valid(*ratings)) {
988  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
989  }
990  BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
991  if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
992  // Insert a fake result.
993  auto *blob_choice = new BLOB_CHOICE;
994  blob_choice->set_unichar_id(new_id);
995  BLOB_CHOICE_IT bc_it(blob_choices);
996  bc_it.add_before_then_move(blob_choice);
997  }
998  }
999  }
1000  return modified;
1001 }
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:52
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:779
void MergeAdjacentBlobs(unsigned index)
Definition: pageres.cpp:1005
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340
bool empty() const
Definition: ratngs.h:280

◆ ConsumeWordResults()

void tesseract::WERD_RES::ConsumeWordResults ( WERD_RES * word)

Definition at line 785 of file pageres.cpp.

785  {
786  denorm = word->denorm;
787  blob_row = word->blob_row;
788  MovePointerData(&chopped_word, &word->chopped_word);
789  MovePointerData(&rebuild_word, &word->rebuild_word);
790  MovePointerData(&box_word, &word->box_word);
791  for (auto data : seam_array) {
792  delete data;
793  }
794  seam_array = word->seam_array;
795  word->seam_array.clear();
796  // TODO: optimize moves.
797  best_state = word->best_state;
798  word->best_state.clear();
799  correct_text = word->correct_text;
800  word->correct_text.clear();
801  blob_widths = word->blob_widths;
802  word->blob_widths.clear();
803  blob_gaps = word->blob_gaps;
804  word->blob_gaps.clear();
805  if (ratings != nullptr) {
807  }
808  MovePointerData(&ratings, &word->ratings);
809  best_choice = word->best_choice;
810  MovePointerData(&raw_choice, &word->raw_choice);
811  best_choices.clear();
812  WERD_CHOICE_IT wc_it(&best_choices);
813  wc_it.add_list_after(&word->best_choices);
814  reject_map = word->reject_map;
815  if (word->blamer_bundle != nullptr) {
816  assert(blamer_bundle != nullptr);
817  blamer_bundle->CopyResults(*(word->blamer_bundle));
818  }
819  CopySimpleFields(*word);
820 }
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:220
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:253

◆ copy_on()

void tesseract::WERD_RES::copy_on ( WERD_RES * word_res)
inline

Definition at line 667 of file pageres.h.

667  { // from this word
668  word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
669  word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
670  word->copy_on(word_res->word);
671  }
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
void copy_on(WERD *other)
Definition: werd.cpp:230

◆ CopySimpleFields()

void tesseract::WERD_RES::CopySimpleFields ( const WERD_RES & source)

Definition at line 253 of file pageres.cpp.

253  {
254  tess_failed = source.tess_failed;
255  tess_accepted = source.tess_accepted;
256  tess_would_adapt = source.tess_would_adapt;
257  done = source.done;
258  unlv_crunch_mode = source.unlv_crunch_mode;
259  small_caps = source.small_caps;
260  odd_size = source.odd_size;
261  fontinfo = source.fontinfo;
262  fontinfo2 = source.fontinfo2;
263  fontinfo_id_count = source.fontinfo_id_count;
264  fontinfo_id2_count = source.fontinfo_id2_count;
265  x_height = source.x_height;
266  caps_height = source.caps_height;
267  baseline_shift = source.baseline_shift;
268  guessed_x_ht = source.guessed_x_ht;
269  guessed_caps_ht = source.guessed_caps_ht;
270  reject_spaces = source.reject_spaces;
271  uch_set = source.uch_set;
272  tesseract = source.tesseract;
273 }
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
float baseline_shift
Definition: pageres.h:316

◆ DebugTopChoice()

void tesseract::WERD_RES::DebugTopChoice ( const char *  msg) const

Definition at line 503 of file pageres.cpp.

503  {
504  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
505  tess_would_adapt, done);
506  if (best_choice == nullptr) {
507  tprintf("<Null choice>\n");
508  } else {
509  best_choice->print(msg);
510  }
511 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void print() const
Definition: ratngs.h:557

◆ DebugWordChoices()

void tesseract::WERD_RES::DebugWordChoices ( bool  debug,
const char *  word_to_debug 
)

Definition at line 483 of file pageres.cpp.

483  {
484  if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&
485  best_choice != nullptr &&
486  best_choice->unichar_string() == std::string(word_to_debug))) {
487  if (raw_choice != nullptr) {
488  raw_choice->print("\nBest Raw Choice");
489  }
490 
491  WERD_CHOICE_IT it(&best_choices);
492  int index = 0;
493  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
494  WERD_CHOICE *choice = it.data();
495  std::string label;
496  label += "\nCooked Choice #" + std::to_string(index);
497  choice->print(label.c_str());
498  }
499  }
500 }
std::string & unichar_string()
Definition: ratngs.h:515

◆ deep_copy()

static WERD_RES* tesseract::WERD_RES::deep_copy ( const WERD_RES * src)
inline static

Definition at line 655 of file pageres.h.

655  {
656  auto *result = new WERD_RES(*src);
657  // That didn't copy the ratings, but we want a copy if there is one to
658  // begin with.
659  if (src->ratings != nullptr) {
660  result->ratings = src->ratings->DeepCopy();
661  }
662  return result;
663  }

◆ FakeClassifyWord()

void tesseract::WERD_RES::FakeClassifyWord ( unsigned  blob_count,
BLOB_CHOICE **  choices 
)

Definition at line 908 of file pageres.cpp.

908  {
909  // Setup the WERD_RES.
910  ASSERT_HOST(box_word != nullptr);
911  ASSERT_HOST(blob_count == box_word->length());
913  ClearRatings();
914  ratings = new MATRIX(blob_count, 1);
915  for (unsigned c = 0; c < blob_count; ++c) {
916  auto *choice_list = new BLOB_CHOICE_LIST;
917  BLOB_CHOICE_IT choice_it(choice_list);
918  choice_it.add_after_then_move(choices[c]);
919  ratings->put(c, c, choice_list);
920  }
921  FakeWordFromRatings(TOP_CHOICE_PERM);
922  reject_map.initialise(blob_count);
923  best_state.clear();
924  best_state.resize(blob_count, 1);
925  done = true;
926 }
@ TOP_CHOICE_PERM
Definition: ratngs.h:234
void put(ICOORD pos, const T &thing)
Definition: matrix.h:260
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:930
void initialise(uint16_t length)
Definition: rejctmap.cpp:67

◆ FakeWordFromRatings()

void tesseract::WERD_RES::FakeWordFromRatings ( PermuterType  permuter)

Definition at line 930 of file pageres.cpp.

930  {
931  int num_blobs = ratings->dimension();
932  auto *word_choice = new WERD_CHOICE(uch_set, num_blobs);
933  word_choice->set_permuter(permuter);
934  for (int b = 0; b < num_blobs; ++b) {
935  UNICHAR_ID unichar_id = UNICHAR_SPACE;
936  // Initialize rating and certainty like in WERD_CHOICE::make_bad().
937  float rating = WERD_CHOICE::kBadRating;
938  float certainty = -FLT_MAX;
939  BLOB_CHOICE_LIST *choices = ratings->get(b, b);
940  if (choices != nullptr && !choices->empty()) {
941  BLOB_CHOICE_IT bc_it(choices);
942  BLOB_CHOICE *choice = bc_it.data();
943  unichar_id = choice->unichar_id();
944  rating = choice->rating();
945  certainty = choice->certainty();
946  }
947  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
948  certainty);
949  }
950  LogNewRawChoice(word_choice);
951  // Ownership of word_choice taken by word here.
952  LogNewCookedChoice(1, false, word_choice);
953 }
@ UNICHAR_SPACE
Definition: unicharset.h:36
T get(ICOORD pos) const
Definition: matrix.h:268
int dimension() const
Definition: matrix.h:612
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:629
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:613
static const float kBadRating
Definition: ratngs.h:256

◆ FilterWordChoices()

void tesseract::WERD_RES::FilterWordChoices ( int  debug_level)

Definition at line 518 of file pageres.cpp.

518  {
519  if (best_choice == nullptr || best_choices.singleton()) {
520  return;
521  }
522 
523  if (debug_level >= 2) {
524  best_choice->print("\nFiltering against best choice");
525  }
526  WERD_CHOICE_IT it(&best_choices);
527  int index = 0;
528  for (it.forward(); !it.at_first(); it.forward(), ++index) {
529  WERD_CHOICE *choice = it.data();
530  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
531  choice->adjust_factor());
532  // i, j index the blob choice in choice, best_choice.
533  // chunk is an index into the chopped_word blobs (AKA chunks).
534  // Since the two words may use different segmentations of the chunks, we
535  // iterate over the chunks to find out whether a comparable blob
536  // classification is much worse than the best result.
537  unsigned i = 0, j = 0, chunk = 0;
538  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
539  // and best_chunk are the indices of the first chunk in the NEXT blob,
540  // i.e. we don't have to increment i, j while chunk < choice_chunk and
541  // best_chunk respectively.
542  auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
543  while (i < choice->length() && j < best_choice->length()) {
544  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
545  choice->certainty(i) - best_choice->certainty(j) < threshold) {
546  if (debug_level >= 2) {
547  choice->print("WorstCertaintyDiffWorseThan");
548  tprintf(
549  "i %u j %u Choice->Blob[i].Certainty %.4g"
550  " WorstOtherChoiceCertainty %g Threshold %g\n",
551  i, j, choice->certainty(i), best_choice->certainty(j), threshold);
552  tprintf("Discarding bad choice #%d\n", index);
553  }
554  delete it.extract();
555  break;
556  }
557  ++chunk;
558  // If needed, advance choice_chunk to keep up with chunk.
559  while (choice_chunk < chunk && ++i < choice->length()) {
560  choice_chunk += choice->state(i);
561  }
562  // If needed, advance best_chunk to keep up with chunk.
563  while (best_chunk < chunk && ++j < best_choice->length()) {
564  best_chunk += best_choice->state(j);
565  }
566  }
567  }
568 }
float adjust_factor() const
Definition: ratngs.h:286

◆ fix_hyphens()

void tesseract::WERD_RES::fix_hyphens ( )

Definition at line 1077 of file pageres.cpp.

1077  {
1078  if (!uch_set->contains_unichar("-") ||
1079  !uch_set->get_enabled(uch_set->unichar_to_id("-"))) {
1080  return; // Don't create it if it is disallowed.
1081  }
1082 
1083  using namespace std::placeholders; // for _1, _2
1084  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
1085  std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
1086 }
bool ConditionalBlobMerge(const std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, const std::function< bool(const TBOX &, const TBOX &)> &box_cb)
Definition: pageres.cpp:971
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1071
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:912

◆ fix_quotes()

void tesseract::WERD_RES::fix_quotes ( )

Definition at line 1047 of file pageres.cpp.

1047  {
1048  if (!uch_set->contains_unichar("\"") ||
1049  !uch_set->get_enabled(uch_set->unichar_to_id("\""))) {
1050  return; // Don't create it if it is disallowed.
1051  }
1052 
1053  using namespace std::placeholders; // for _1, _2
1054  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr);
1055 }
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1036

◆ GetBlobChoice()

BLOB_CHOICE * tesseract::WERD_RES::GetBlobChoice ( unsigned  index) const

Definition at line 768 of file pageres.cpp.

768  {
769  if (index >= best_choice->length()) {
770  return nullptr;
771  }
772  BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
773  return FindMatchingChoice(best_choice->unichar_id(index), choices);
774 }

◆ GetBlobChoices()

BLOB_CHOICE_LIST * tesseract::WERD_RES::GetBlobChoices ( int  index) const

Definition at line 779 of file pageres.cpp.

779  {
780  return best_choice->blob_choices(index, ratings);
781 }
BLOB_CHOICE_LIST * blob_choices(unsigned index, MATRIX *ratings) const
Definition: ratngs.cpp:274

◆ GetBlobsGap()

int tesseract::WERD_RES::GetBlobsGap ( unsigned  blob_index) const

Definition at line 757 of file pageres.cpp.

757  {
758  if (blob_index >= blob_gaps.size()) {
759  return 0;
760  }
761  return blob_gaps[blob_index];
762 }

◆ GetBlobsWidth()

int tesseract::WERD_RES::GetBlobsWidth ( int  start_blob,
int  last_blob 
) const

Definition at line 746 of file pageres.cpp.

746  {
747  int result = 0;
748  for (int b = start_blob; b <= last_blob; ++b) {
749  result += blob_widths[b];
750  if (b < last_blob) {
751  result += blob_gaps[b];
752  }
753  }
754  return result;
755 }

◆ HyphenBoxesOverlap()

bool tesseract::WERD_RES::HyphenBoxesOverlap ( const TBOX & box1,
const TBOX & box2
)

Definition at line 1071 of file pageres.cpp.

1071  {
1072  return box1.right() >= box2.left();
1073 }

◆ InitForRetryRecognition()

void tesseract::WERD_RES::InitForRetryRecognition ( const WERD_RES & source)

Definition at line 279 of file pageres.cpp.

279  {
280  word = source.word;
281  CopySimpleFields(source);
282  if (source.blamer_bundle != nullptr) {
283  blamer_bundle = new BlamerBundle();
284  blamer_bundle->CopyTruth(*source.blamer_bundle);
285  }
286 }
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214

◆ InsertSeam()

void tesseract::WERD_RES::InsertSeam ( int  blob_number,
SEAM * seam
)

Definition at line 419 of file pageres.cpp.

419  {
420  // Insert the seam into the SEAMS array.
421  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
422  seam_array.insert(seam_array.begin() + blob_number, seam);
423  if (ratings != nullptr) {
424  // Expand the ratings matrix.
425  ratings = ratings->ConsumeAndMakeBigger(blob_number);
426  // Fix all the segmentation states.
427  if (raw_choice != nullptr) {
428  raw_choice->UpdateStateForSplit(blob_number);
429  }
430  WERD_CHOICE_IT wc_it(&best_choices);
431  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
432  WERD_CHOICE *choice = wc_it.data();
433  choice->UpdateStateForSplit(blob_number);
434  }
435  SetupBlobWidthsAndGaps();
436  }
437 }
std::vector< TBLOB * > blobs
Definition: blobs.h:462
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:61
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:401
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:664

◆ IsAmbiguous()

bool tesseract::WERD_RES::IsAmbiguous ( )

Definition at line 455 of file pageres.cpp.

455  {
456  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
457 }
bool dangerous_ambig_found() const
Definition: ratngs.h:344

◆ LogNewCookedChoice()

bool tesseract::WERD_RES::LogNewCookedChoice ( int  max_num_choices,
bool  debug,
WERD_CHOICE * word_choice 
)

Definition at line 629 of file pageres.cpp.

630  {
631  if (best_choice != nullptr) {
632  // Throw out obviously bad choices to save some work.
633  // TODO(rays) Get rid of this! This piece of code produces different
634  // results according to the order in which words are found, which is an
635  // undesirable behavior. It would be better to keep all the choices and
636  // prune them later when more information is available.
637  float max_certainty_delta = StopperAmbigThreshold(
638  best_choice->adjust_factor(), word_choice->adjust_factor());
639  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
640  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
641  }
642  if (word_choice->certainty() - best_choice->certainty() <
643  max_certainty_delta) {
644  if (debug) {
645  std::string bad_string;
646  word_choice->string_and_lengths(&bad_string, nullptr);
647  tprintf(
648  "Discarding choice \"%s\" with an overly low certainty"
649  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
650  bad_string.c_str(), word_choice->certainty(),
652  max_certainty_delta + best_choice->certainty());
653  }
654  delete word_choice;
655  return false;
656  }
657  }
658 
659  // Insert in the list in order of increasing rating, but knock out worse
660  // string duplicates.
661  WERD_CHOICE_IT it(&best_choices);
662  const std::string &new_str = word_choice->unichar_string();
663  bool inserted = false;
664  int num_choices = 0;
665  if (!it.empty()) {
666  do {
667  WERD_CHOICE *choice = it.data();
668  if (choice->rating() > word_choice->rating() && !inserted) {
669  // Time to insert.
670  it.add_before_stay_put(word_choice);
671  inserted = true;
672  if (num_choices == 0) {
673  best_choice = word_choice; // This is the new best.
674  }
675  ++num_choices;
676  }
677  if (choice->unichar_string() == new_str) {
678  if (inserted) {
679  // New is better.
680  delete it.extract();
681  } else {
682  // Old is better.
683  if (debug) {
684  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
685  new_str.c_str(), word_choice->rating(), choice->rating());
686  }
687  delete word_choice;
688  return false;
689  }
690  } else {
691  ++num_choices;
692  if (num_choices > max_num_choices) {
693  delete it.extract();
694  }
695  }
696  it.forward();
697  } while (!it.at_first());
698  }
699  if (!inserted && num_choices < max_num_choices) {
700  it.add_to_end(word_choice);
701  inserted = true;
702  if (num_choices == 0) {
703  best_choice = word_choice; // This is the new best.
704  }
705  }
706  if (debug) {
707  if (inserted) {
708  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
709  } else {
710  tprintf("Poor");
711  }
712  word_choice->print(" Word Choice");
713  }
714  if (!inserted) {
715  delete word_choice;
716  return false;
717  }
718  return true;
719 }

◆ LogNewRawChoice()

bool tesseract::WERD_RES::LogNewRawChoice ( WERD_CHOICE * word_choice)

Definition at line 613 of file pageres.cpp.

613  {
614  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
615  delete raw_choice;
616  raw_choice = new WERD_CHOICE(*word_choice);
618  return true;
619  }
620  return false;
621 }
void set_permuter(uint8_t perm)
Definition: ratngs.h:356
float rating() const
Definition: ratngs.h:308

◆ merge_tess_fails()

void tesseract::WERD_RES::merge_tess_fails ( )

Definition at line 1099 of file pageres.cpp.

1099  {
1100  using namespace std::placeholders; // for _1, _2
1101  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),
1102  nullptr)) {
1103  unsigned len = best_choice->length();
1104  ASSERT_HOST(reject_map.length() == len);
1105  ASSERT_HOST(box_word->length() == len);
1106  }
1107 }
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1090
uint16_t length() const
Definition: rejctmap.h:333

◆ MergeAdjacentBlobs()

void tesseract::WERD_RES::MergeAdjacentBlobs ( unsigned  index)

Definition at line 1005 of file pageres.cpp.

1005  {
1006  if (reject_map.length() == best_choice->length()) {
1007  reject_map.remove_pos(index);
1008  }
1009  best_choice->remove_unichar_id(index + 1);
1010  rebuild_word->MergeBlobs(index, index + 2);
1011  box_word->MergeBoxes(index, index + 2);
1012  if (index + 1 < best_state.size()) {
1013  best_state[index] += best_state[index + 1];
1014  best_state.erase(best_state.begin() + index + 1);
1015  }
1016 }
void MergeBlobs(unsigned start, unsigned end)
Definition: blobs.cpp:874
void MergeBoxes(unsigned start, unsigned end)
Definition: boxword.cpp:138
void remove_unichar_id(unsigned index)
Definition: ratngs.h:454
void remove_pos(uint16_t pos)
Definition: rejctmap.cpp:100

◆ operator=()

WERD_RES & tesseract::WERD_RES::operator= ( const WERD_RES & source)

Definition at line 186 of file pageres.cpp.

186  {
187  this->ELIST_LINK::operator=(source);
188  Clear();
189  if (source.combination) {
190  word = new WERD;
191  *word = *(source.word); // deep copy
192  } else {
193  word = source.word; // pt to same word
194  }
195  if (source.bln_boxes != nullptr) {
196  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
197  }
198  if (source.chopped_word != nullptr) {
199  chopped_word = new TWERD(*source.chopped_word);
200  }
201  if (source.rebuild_word != nullptr) {
202  rebuild_word = new TWERD(*source.rebuild_word);
203  }
204  // TODO(rays) Do we ever need to copy the seam_array?
205  blob_row = source.blob_row;
206  denorm = source.denorm;
207  if (source.box_word != nullptr) {
208  box_word = new tesseract::BoxWord(*source.box_word);
209  }
210  best_state = source.best_state;
211  correct_text = source.correct_text;
212  blob_widths = source.blob_widths;
213  blob_gaps = source.blob_gaps;
214  // None of the uses of operator= require the ratings matrix to be copied,
215  // so don't as it would be really slow.
216 
217  // Copy the cooked choices.
218  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
219  WERD_CHOICE_IT wc_dest_it(&best_choices);
220  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
221  const WERD_CHOICE *choice = wc_it.data();
222  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
223  }
224  if (!wc_dest_it.empty()) {
225  wc_dest_it.move_to_first();
226  best_choice = wc_dest_it.data();
227  } else {
228  best_choice = nullptr;
229  }
230 
231  if (source.raw_choice != nullptr) {
232  raw_choice = new WERD_CHOICE(*source.raw_choice);
233  } else {
234  raw_choice = nullptr;
235  }
236  if (source.ep_choice != nullptr) {
237  ep_choice = new WERD_CHOICE(*source.ep_choice);
238  } else {
239  ep_choice = nullptr;
240  }
241  reject_map = source.reject_map;
242  combination = source.combination;
243  part_of_combo = source.part_of_combo;
244  CopySimpleFields(source);
245  if (source.blamer_bundle != nullptr) {
246  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
247  }
248  return *this;
249 }
void operator=(const ELIST_LINK &)
Definition: elst.h:100

◆ PiecesAllNatural()

bool tesseract::WERD_RES::PiecesAllNatural ( int  start,
int  count 
) const

Definition at line 1111 of file pageres.cpp.

1111  {
1112  // all seams must have no splits.
1113  for (int index = start; index < start + count - 1; ++index) {
1114  if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {
1115  SEAM *seam = seam_array[index];
1116  if (seam != nullptr && seam->HasAnySplits()) {
1117  return false;
1118  }
1119  }
1120  }
1121  return true;
1122 }

◆ PrintBestChoices()

void tesseract::WERD_RES::PrintBestChoices ( ) const

Definition at line 731 of file pageres.cpp.

731  {
732  std::string alternates_str;
733  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
734  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
735  if (!it.at_first()) {
736  alternates_str += "\", \"";
737  }
738  alternates_str += it.data()->unichar_string();
739  }
740  tprintf("Alternates for \"%s\": {\"%s\"}\n",
741  best_choice->unichar_string().c_str(), alternates_str.c_str());
742 }

◆ RawUTF8()

const char* tesseract::WERD_RES::RawUTF8 ( unsigned  blob_index) const
inline

Definition at line 376 of file pageres.h.

376  {
377  if (blob_index >= raw_choice->length()) {
378  return nullptr;
379  }
380  UNICHAR_ID id = raw_choice->unichar_id(blob_index);
381  if (static_cast<unsigned>(id) >= uch_set->size()) {
382  return nullptr;
383  }
384  return uch_set->id_to_unichar(id);
385  }

◆ RebuildBestState()

void tesseract::WERD_RES::RebuildBestState ( )

Definition at line 837 of file pageres.cpp.

837  {
838  ASSERT_HOST(best_choice != nullptr);
839  delete rebuild_word;
840  rebuild_word = new TWERD;
841  if (seam_array.empty()) {
842  start_seam_list(chopped_word, &seam_array);
843  }
844  best_state.clear();
845  int start = 0;
846  for (unsigned i = 0; i < best_choice->length(); ++i) {
847  int length = best_choice->state(i);
848  best_state.push_back(length);
849  if (length > 1) {
850  SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
851  start + length - 1);
852  }
853  TBLOB *blob = chopped_word->blobs[start];
854  rebuild_word->blobs.push_back(new TBLOB(*blob));
855  if (length > 1) {
856  SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
857  start + length - 1);
858  }
859  start += length;
860  }
861 }
void start_seam_list(TWERD *word, std::vector< SEAM * > *seam_array)
Definition: seam.cpp:262
static void JoinPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:204
static void BreakPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:181

◆ ReplaceBestChoice()

void tesseract::WERD_RES::ReplaceBestChoice ( WERD_CHOICE * choice)

Definition at line 824 of file pageres.cpp.

824  {
825  best_choice = choice;
827  SetupBoxWord();
828  // Make up a fake reject map of the right length to keep the
829  // rejection pass happy.
833 }
void SetScriptPositions()
Definition: pageres.cpp:888
void RebuildBestState()
Definition: pageres.cpp:837

◆ SetAllScriptPositions()

void tesseract::WERD_RES::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 895 of file pageres.cpp.

895  {
897  WERD_CHOICE_IT wc_it(&best_choices);
898  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
899  wc_it.data()->SetAllScriptPositions(position);
900  }
901 }
void SetAllScriptPositions(ScriptPos position)
Definition: ratngs.cpp:592

◆ SetScriptPositions()

void tesseract::WERD_RES::SetScriptPositions ( )

Definition at line 888 of file pageres.cpp.

888  {
890 }
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:528

◆ SetupBasicsFromChoppedWord()

void tesseract::WERD_RES::SetupBasicsFromChoppedWord ( const UNICHARSET & unicharset_in)

Definition at line 344 of file pageres.cpp.

344  {
349 }
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56

◆ SetupBlamerBundle()

void tesseract::WERD_RES::SetupBlamerBundle ( )

Definition at line 394 of file pageres.cpp.

394  {
395  if (blamer_bundle != nullptr) {
397  }
398 }
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151

◆ SetupBlobWidthsAndGaps()

void tesseract::WERD_RES::SetupBlobWidthsAndGaps ( )

Definition at line 401 of file pageres.cpp.

401  {
402  blob_widths.clear();
403  blob_gaps.clear();
404  int num_blobs = chopped_word->NumBlobs();
405  for (int b = 0; b < num_blobs; ++b) {
406  TBLOB *blob = chopped_word->blobs[b];
407  TBOX box = blob->bounding_box();
408  blob_widths.push_back(box.width());
409  if (b + 1 < num_blobs) {
410  blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -
411  box.right());
412  }
413  }
414 }
@ TBOX
unsigned NumBlobs() const
Definition: blobs.h:449

◆ SetupBoxWord()

void tesseract::WERD_RES::SetupBoxWord ( )

Definition at line 879 of file pageres.cpp.

879  {
880  delete box_word;
884 }
void ComputeBoundingBoxes()
Definition: blobs.cpp:857
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92
const BLOCK * block() const
Definition: normalis.h:265

◆ SetupFake()

void tesseract::WERD_RES::SetupFake ( const UNICHARSET & uch)

Definition at line 353 of file pageres.cpp.

353  {
354  ClearResults();
355  SetupWordScript(unicharset_in);
356  chopped_word = new TWERD;
357  rebuild_word = new TWERD;
360  int blob_count = word->cblob_list()->length();
361  if (blob_count > 0) {
362  auto **fake_choices = new BLOB_CHOICE *[blob_count];
363  // For non-text blocks, just pass any blobs through to the box_word
364  // and call the word failed with a fake classification.
365  C_BLOB_IT b_it(word->cblob_list());
366  int blob_id = 0;
367  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
368  TBOX box = b_it.data()->bounding_box();
369  box_word->InsertBox(box_word->length(), box);
370  fake_choices[blob_id++] = new BLOB_CHOICE;
371  }
372  FakeClassifyWord(blob_count, fake_choices);
373  delete[] fake_choices;
374  } else {
375  auto *word = new WERD_CHOICE(&unicharset_in);
376  word->make_bad();
378  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
379  LogNewCookedChoice(1, false, word);
380  }
381  tess_failed = true;
382  done = true;
383 }
void InsertBox(unsigned index, const TBOX &box)
Definition: boxword.cpp:157
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:908
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:385
C_BLOB_LIST * cblob_list()
Definition: werd.h:96

◆ SetupForRecognition()

bool tesseract::WERD_RES::SetupForRecognition ( const UNICHARSET & unicharset_in,
tesseract::Tesseract * tesseract,
Image  pix,
int  norm_mode,
const TBOX * norm_box,
bool  numeric_mode,
bool  use_body_size,
bool  allow_detailed_fx,
ROW * row,
const BLOCK * block 
)

Definition at line 304 of file pageres.cpp.

309  {
310  auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);
311  tesseract = tess;
312  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
313  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
314  word->cblob_list()->empty()) ||
315  (pb != nullptr && !pb->IsText())) {
316  // Empty words occur when all the blobs have been moved to the rej_blobs
317  // list, which seems to occur frequently in junk.
318  SetupFake(unicharset_in);
319  word->set_flag(W_REP_CHAR, false);
320  return false;
321  }
322  ClearResults();
323  SetupWordScript(unicharset_in);
324  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
325  float word_xheight =
326  use_body_size && row != nullptr && row->body_size() > 0.0f
327  ? row->body_size()
328  : x_height;
329  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
330  word_xheight, baseline_shift, numeric_mode,
331  norm_mode_hint, norm_box, &denorm);
332  blob_row = row;
333  SetupBasicsFromChoppedWord(unicharset_in);
335  int num_blobs = chopped_word->NumBlobs();
336  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
337  tess_failed = false;
338  return true;
339 }
@ W_INVERSE
white on black
Definition: werd.h:43
@ W_REP_CHAR
repeated character
Definition: werd.h:40
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:55
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:778
void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:792
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:353
void SetupBlamerBundle()
Definition: pageres.cpp:394
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:344

◆ SetupWordScript()

void tesseract::WERD_RES::SetupWordScript ( const UNICHARSET & unicharset_in)

Definition at line 385 of file pageres.cpp.

385  {
386  uch_set = &uch;
387  int script = uch.default_sid();
388  word->set_script_id(script);
389  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
390  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
391 }
@ W_SCRIPT_HAS_XHEIGHT
x-height concept makes sense.
Definition: werd.h:37
@ W_SCRIPT_IS_LATIN
Special case latin for y. splitting.
Definition: werd.h:38
void set_script_id(int id)
Definition: werd.h:109
int default_sid() const
Definition: unicharset.h:947

◆ StatesAllValid()

bool tesseract::WERD_RES::StatesAllValid ( )

Definition at line 461 of file pageres.cpp.

461  {
462  unsigned ratings_dim = ratings->dimension();
463  if (raw_choice->TotalOfStates() != ratings_dim) {
464  tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
465  raw_choice->TotalOfStates(), ratings_dim);
466  return false;
467  }
468  WERD_CHOICE_IT it(&best_choices);
469  unsigned index = 0;
470  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
471  WERD_CHOICE *choice = it.data();
472  if (choice->TotalOfStates() != ratings_dim) {
473  tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
474  index, choice->TotalOfStates(), ratings_dim);
475  return false;
476  }
477  }
478  return true;
479 }
unsigned TotalOfStates() const
Definition: ratngs.cpp:676

◆ SymbolDirection()

UNICHARSET::Direction tesseract::WERD_RES::SymbolDirection ( unsigned  blob_index) const
inline

Definition at line 387 of file pageres.h.

387  {
388  if (best_choice == nullptr || blob_index >= best_choice->length()) {
390  }
391  return uch_set->get_direction(best_choice->unichar_id(blob_index));
392  }

◆ UnicharsInReadingOrder()

bool tesseract::WERD_RES::UnicharsInReadingOrder ( ) const
inline

Definition at line 435 of file pageres.h.

435  {
437  }
bool unichars_in_script_order() const
Definition: ratngs.h:509

Member Data Documentation

◆ baseline_shift

float tesseract::WERD_RES::baseline_shift = 0.0f

Definition at line 316 of file pageres.h.

◆ best_choice

WERD_CHOICE* tesseract::WERD_RES::best_choice = nullptr

Definition at line 239 of file pageres.h.

◆ best_choices

WERD_CHOICE_LIST tesseract::WERD_RES::best_choices

Definition at line 247 of file pageres.h.

◆ best_state

std::vector<int> tesseract::WERD_RES::best_state

Definition at line 283 of file pageres.h.

◆ blamer_bundle

BlamerBundle* tesseract::WERD_RES::blamer_bundle = nullptr

Definition at line 250 of file pageres.h.

◆ bln_boxes

tesseract::BoxWord* tesseract::WERD_RES::bln_boxes = nullptr

Definition at line 193 of file pageres.h.

◆ blob_gaps

std::vector<int> tesseract::WERD_RES::blob_gaps

Definition at line 217 of file pageres.h.

◆ blob_row

ROW* tesseract::WERD_RES::blob_row = nullptr

Definition at line 195 of file pageres.h.

◆ blob_widths

std::vector<int> tesseract::WERD_RES::blob_widths

Definition at line 214 of file pageres.h.

◆ box_word

tesseract::BoxWord* tesseract::WERD_RES::box_word = nullptr

Definition at line 270 of file pageres.h.

◆ caps_height

float tesseract::WERD_RES::caps_height = 0.0f

Definition at line 315 of file pageres.h.

◆ chopped_word

TWERD* tesseract::WERD_RES::chopped_word = nullptr

Definition at line 210 of file pageres.h.

◆ combination

bool tesseract::WERD_RES::combination = false

Definition at line 337 of file pageres.h.

◆ correct_text

std::vector<std::string> tesseract::WERD_RES::correct_text

Definition at line 287 of file pageres.h.

◆ CTC_symbol_choices

std::vector<std::vector<std::pair<const char *, float> > > tesseract::WERD_RES::CTC_symbol_choices

Definition at line 224 of file pageres.h.

◆ denorm

DENORM tesseract::WERD_RES::denorm

Definition at line 199 of file pageres.h.

◆ done

bool tesseract::WERD_RES::done = false

Definition at line 303 of file pageres.h.

◆ end

int tesseract::WERD_RES::end = 0

Definition at line 228 of file pageres.h.

◆ ep_choice

WERD_CHOICE* tesseract::WERD_RES::ep_choice = nullptr

Definition at line 291 of file pageres.h.

◆ fontinfo

const FontInfo* tesseract::WERD_RES::fontinfo = nullptr

Definition at line 307 of file pageres.h.

◆ fontinfo2

const FontInfo* tesseract::WERD_RES::fontinfo2 = nullptr

Definition at line 308 of file pageres.h.

◆ fontinfo_id2_count

int8_t tesseract::WERD_RES::fontinfo_id2_count = 0

Definition at line 310 of file pageres.h.

◆ fontinfo_id_count

int8_t tesseract::WERD_RES::fontinfo_id_count = 0

Definition at line 309 of file pageres.h.

◆ guessed_caps_ht

bool tesseract::WERD_RES::guessed_caps_ht = true

Definition at line 312 of file pageres.h.

◆ guessed_x_ht

bool tesseract::WERD_RES::guessed_x_ht = true

Definition at line 311 of file pageres.h.

◆ leading_space

bool tesseract::WERD_RES::leading_space = false

Definition at line 226 of file pageres.h.

◆ odd_size

bool tesseract::WERD_RES::odd_size = false

Definition at line 305 of file pageres.h.

◆ part_of_combo

bool tesseract::WERD_RES::part_of_combo = false

Definition at line 338 of file pageres.h.

◆ ratings

MATRIX* tesseract::WERD_RES::ratings = nullptr

Definition at line 235 of file pageres.h.

◆ raw_choice

WERD_CHOICE* tesseract::WERD_RES::raw_choice = nullptr

Definition at line 244 of file pageres.h.

◆ rebuild_word

TWERD* tesseract::WERD_RES::rebuild_word = nullptr

Definition at line 264 of file pageres.h.

◆ reject_map

REJMAP tesseract::WERD_RES::reject_map

Definition at line 292 of file pageres.h.

◆ reject_spaces

bool tesseract::WERD_RES::reject_spaces = false

Definition at line 339 of file pageres.h.

◆ seam_array

std::vector<SEAM *> tesseract::WERD_RES::seam_array

Definition at line 212 of file pageres.h.

◆ segmented_timesteps

std::vector<std::vector<std::vector<std::pair<const char *, float> > > > tesseract::WERD_RES::segmented_timesteps

Definition at line 222 of file pageres.h.

◆ small_caps

bool tesseract::WERD_RES::small_caps = false

Definition at line 304 of file pageres.h.

◆ space_certainty

float tesseract::WERD_RES::space_certainty = 0.0f

Definition at line 319 of file pageres.h.

◆ tess_accepted

bool tesseract::WERD_RES::tess_accepted = false

Definition at line 301 of file pageres.h.

◆ tess_failed

bool tesseract::WERD_RES::tess_failed = false

Definition at line 293 of file pageres.h.

◆ tess_would_adapt

bool tesseract::WERD_RES::tess_would_adapt = false

Definition at line 302 of file pageres.h.

◆ tesseract

tesseract::Tesseract* tesseract::WERD_RES::tesseract = nullptr

Definition at line 278 of file pageres.h.

◆ timesteps

std::vector<std::vector<std::pair<const char *, float> > > tesseract::WERD_RES::timesteps

Definition at line 219 of file pageres.h.

◆ uch_set

const UNICHARSET* tesseract::WERD_RES::uch_set = nullptr

Definition at line 201 of file pageres.h.

◆ unlv_crunch_mode

CRUNCH_MODE tesseract::WERD_RES::unlv_crunch_mode = CR_NONE

Definition at line 313 of file pageres.h.

◆ word

WERD* tesseract::WERD_RES::word = nullptr

Definition at line 184 of file pageres.h.

◆ x_height

float tesseract::WERD_RES::x_height = 0.0f

Definition at line 314 of file pageres.h.


The documentation for this class was generated from the following files: