tesseract  5.0.0
tesseract::Textord Class Reference

#include <textord.h>

Public Member Functions

 Textord (CCStruct *ccstruct)
 
 ~Textord ()=default
 
void TextordPage (PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
 
void CleanupSingleRowResult (PageSegMode pageseg_mode, PAGE_RES *page_res)
 
bool use_cjk_fp_model () const
 
void set_use_cjk_fp_model (bool flag)
 
void to_spacing (ICOORD page_tr, TO_BLOCK_LIST *blocks)
 
ROW * make_prop_words (TO_ROW *row, FCOORD rotation)
 
ROW * make_blob_words (TO_ROW *row, FCOORD rotation)
 
void find_components (Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
 
void filter_blobs (ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
 
 BOOL_VAR_H (textord_single_height_mode)
 
 BOOL_VAR_H (tosp_old_to_method)
 
 BOOL_VAR_H (tosp_old_to_constrain_sp_kn)
 
 BOOL_VAR_H (tosp_only_use_prop_rows)
 
 BOOL_VAR_H (tosp_force_wordbreak_on_punct)
 
 BOOL_VAR_H (tosp_use_pre_chopping)
 
 BOOL_VAR_H (tosp_old_to_bug_fix)
 
 BOOL_VAR_H (tosp_block_use_cert_spaces)
 
 BOOL_VAR_H (tosp_row_use_cert_spaces)
 
 BOOL_VAR_H (tosp_narrow_blobs_not_cert)
 
 BOOL_VAR_H (tosp_row_use_cert_spaces1)
 
 BOOL_VAR_H (tosp_recovery_isolated_row_stats)
 
 BOOL_VAR_H (tosp_only_small_gaps_for_kern)
 
 BOOL_VAR_H (tosp_all_flips_fuzzy)
 
 BOOL_VAR_H (tosp_fuzzy_limit_all)
 
 BOOL_VAR_H (tosp_stats_use_xht_gaps)
 
 BOOL_VAR_H (tosp_use_xht_gaps)
 
 BOOL_VAR_H (tosp_only_use_xht_gaps)
 
 BOOL_VAR_H (tosp_rule_9_test_punct)
 
 BOOL_VAR_H (tosp_flip_fuzz_kn_to_sp)
 
 BOOL_VAR_H (tosp_flip_fuzz_sp_to_kn)
 
 BOOL_VAR_H (tosp_improve_thresh)
 
 INT_VAR_H (tosp_debug_level)
 
 INT_VAR_H (tosp_enough_space_samples_for_median)
 
 INT_VAR_H (tosp_redo_kern_limit)
 
 INT_VAR_H (tosp_few_samples)
 
 INT_VAR_H (tosp_short_row)
 
 INT_VAR_H (tosp_sanity_method)
 
 double_VAR_H (tosp_old_sp_kn_th_factor)
 
 double_VAR_H (tosp_threshold_bias1)
 
 double_VAR_H (tosp_threshold_bias2)
 
 double_VAR_H (tosp_narrow_fraction)
 
 double_VAR_H (tosp_narrow_aspect_ratio)
 
 double_VAR_H (tosp_wide_fraction)
 
 double_VAR_H (tosp_wide_aspect_ratio)
 
 double_VAR_H (tosp_fuzzy_space_factor)
 
 double_VAR_H (tosp_fuzzy_space_factor1)
 
 double_VAR_H (tosp_fuzzy_space_factor2)
 
 double_VAR_H (tosp_gap_factor)
 
 double_VAR_H (tosp_kern_gap_factor1)
 
 double_VAR_H (tosp_kern_gap_factor2)
 
 double_VAR_H (tosp_kern_gap_factor3)
 
 double_VAR_H (tosp_ignore_big_gaps)
 
 double_VAR_H (tosp_ignore_very_big_gaps)
 
 double_VAR_H (tosp_rep_space)
 
 double_VAR_H (tosp_enough_small_gaps)
 
 double_VAR_H (tosp_table_kn_sp_ratio)
 
 double_VAR_H (tosp_table_xht_sp_ratio)
 
 double_VAR_H (tosp_table_fuzzy_kn_sp_ratio)
 
 double_VAR_H (tosp_fuzzy_kn_fraction)
 
 double_VAR_H (tosp_fuzzy_sp_fraction)
 
 double_VAR_H (tosp_min_sane_kn_sp)
 
 double_VAR_H (tosp_init_guess_kn_mult)
 
 double_VAR_H (tosp_init_guess_xht_mult)
 
 double_VAR_H (tosp_max_sane_kn_thresh)
 
 double_VAR_H (tosp_flip_caution)
 
 double_VAR_H (tosp_large_kerning)
 
 double_VAR_H (tosp_dont_fool_with_small_kerns)
 
 double_VAR_H (tosp_near_lh_edge)
 
 double_VAR_H (tosp_silly_kn_sp_gap)
 
 double_VAR_H (tosp_pass_wide_fuzz_sp_to_context)
 
 BOOL_VAR_H (textord_no_rejects)
 
 BOOL_VAR_H (textord_show_blobs)
 
 BOOL_VAR_H (textord_show_boxes)
 
 INT_VAR_H (textord_max_noise_size)
 
 INT_VAR_H (textord_baseline_debug)
 
 double_VAR_H (textord_noise_area_ratio)
 
 double_VAR_H (textord_initialx_ile)
 
 double_VAR_H (textord_initialasc_ile)
 
 INT_VAR_H (textord_noise_sizefraction)
 
 double_VAR_H (textord_noise_sizelimit)
 
 INT_VAR_H (textord_noise_translimit)
 
 double_VAR_H (textord_noise_normratio)
 
 BOOL_VAR_H (textord_noise_rejwords)
 
 BOOL_VAR_H (textord_noise_rejrows)
 
 double_VAR_H (textord_noise_syfract)
 
 double_VAR_H (textord_noise_sxfract)
 
 double_VAR_H (textord_noise_hfract)
 
 INT_VAR_H (textord_noise_sncount)
 
 double_VAR_H (textord_noise_rowratio)
 
 BOOL_VAR_H (textord_noise_debug)
 
 double_VAR_H (textord_blshift_maxshift)
 
 double_VAR_H (textord_blshift_xfraction)
 
compute_block_xheight

Compute the xheight of the individual rows, then correlate them and interpret ascenderless lines, correcting xheights.

First we compute our best guess of the x-height of each row independently with compute_row_xheight(), which looks for a pair of commonly occurring heights that could be x-height and ascender height. This function also attempts to find descenders of lowercase letters (i.e. not the small descenders that could appear in upper case letters such as Q and J).

After this computation each row falls into one of the following categories:
- ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be a regular row; we'll use its xheight to compute xheight and ascrise estimates for the block.
- ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in the xheight of this row (don't use it for estimating block xheight), but this row can't contain all caps.
- ROW_UNKNOWN: a row with no ascenders/descenders; it could be all lowercase (or mostly lowercase for fonts with very few ascenders), all upper case or small caps.
- ROW_INVALID: no meaningful xheight could be found for this row.

We then run correct_row_xheight() and use the computed xheight and ascrise averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND, ROW_UNKNOWN and ROW_INVALID categories.

void compute_block_xheight (TO_BLOCK *block, float gradient)
 
compute_row_xheight

Estimate the xheight of this row. Compute the ascender rise and descender drop at the same time. Set xheight_evidence to the number of blobs with the chosen xheight that appear in this row.

void compute_row_xheight (TO_ROW *row, const FCOORD &rotation, float gradient, int block_line_size)
 
make_spline_rows

Re-fit the rows in the block to the given gradient.

void make_spline_rows (TO_BLOCK *block, float gradient, bool testing_on)
 

Detailed Description

Definition at line 76 of file textord.h.

Constructor & Destructor Documentation

◆ Textord()

tesseract::Textord::Textord ( CCStruct * ccstruct)
explicit

Definition at line 35 of file textord.cpp.

36  : ccstruct_(ccstruct)
37  , use_cjk_fp_model_(false)
38  ,
39  // makerow.cpp ///////////////////////////////////////////
40  BOOL_MEMBER(textord_single_height_mode, false, "Script has no xheight, so use a single mode",
41  ccstruct_->params())
42  ,
43  // tospace.cpp ///////////////////////////////////////////
44  BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?", ccstruct_->params())
45  , BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false,
46  "Constrain relative values of inter and intra-word gaps for "
47  "old_to_method.",
48  ccstruct_->params())
49  , BOOL_MEMBER(tosp_only_use_prop_rows, true, "Block stats to use fixed pitch rows?",
50  ccstruct_->params())
51  , BOOL_MEMBER(tosp_force_wordbreak_on_punct, false,
52  "Force word breaks on punct to break long lines in non-space "
53  "delimited langs",
54  ccstruct_->params())
55  , BOOL_MEMBER(tosp_use_pre_chopping, false, "Space stats use prechopping?", ccstruct_->params())
56  , BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code", ccstruct_->params())
57  , BOOL_MEMBER(tosp_block_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params())
58  , BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params())
59  , BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, "Only stat OBVIOUS spaces", ccstruct_->params())
60  , BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces", ccstruct_->params())
61  , BOOL_MEMBER(tosp_recovery_isolated_row_stats, true,
62  "Use row alone when inadequate cert spaces", ccstruct_->params())
63  , BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess", ccstruct_->params())
64  , BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?", ccstruct_->params())
65  , BOOL_MEMBER(tosp_fuzzy_limit_all, true, "Don't restrict kn->sp fuzzy limit to tables",
66  ccstruct_->params())
67  , BOOL_MEMBER(tosp_stats_use_xht_gaps, true, "Use within xht gap for wd breaks",
68  ccstruct_->params())
69  , BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks", ccstruct_->params())
70  , BOOL_MEMBER(tosp_only_use_xht_gaps, false, "Only use within xht gap for wd breaks",
71  ccstruct_->params())
72  , BOOL_MEMBER(tosp_rule_9_test_punct, false, "Don't chng kn to space next to punct",
73  ccstruct_->params())
74  , BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip", ccstruct_->params())
75  , BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip", ccstruct_->params())
76  , BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic", ccstruct_->params())
77  , INT_MEMBER(tosp_debug_level, 0, "Debug data", ccstruct_->params())
78  , INT_MEMBER(tosp_enough_space_samples_for_median, 3, "or should we use mean",
79  ccstruct_->params())
80  , INT_MEMBER(tosp_redo_kern_limit, 10, "No.samples reqd to reestimate for row",
81  ccstruct_->params())
82  , INT_MEMBER(tosp_few_samples, 40, "No.gaps reqd with 1 large gap to treat as a table",
83  ccstruct_->params())
84  , INT_MEMBER(tosp_short_row, 20, "No.gaps reqd with few cert spaces to use certs",
85  ccstruct_->params())
86  , INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly", ccstruct_->params())
87  , double_MEMBER(tosp_old_sp_kn_th_factor, 2.0,
88  "Factor for defining space threshold in terms of space and "
89  "kern sizes",
90  ccstruct_->params())
91  , double_MEMBER(tosp_threshold_bias1, 0, "how far between kern and space?", ccstruct_->params())
92  , double_MEMBER(tosp_threshold_bias2, 0, "how far between kern and space?", ccstruct_->params())
93  , double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow", ccstruct_->params())
94  , double_MEMBER(tosp_narrow_aspect_ratio, 0.48, "narrow if w/h less than this",
95  ccstruct_->params())
96  , double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide", ccstruct_->params())
97  , double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this", ccstruct_->params())
98  , double_MEMBER(tosp_fuzzy_space_factor, 0.6, "Fract of xheight for fuzz sp",
99  ccstruct_->params())
100  , double_MEMBER(tosp_fuzzy_space_factor1, 0.5, "Fract of xheight for fuzz sp",
101  ccstruct_->params())
102  , double_MEMBER(tosp_fuzzy_space_factor2, 0.72, "Fract of xheight for fuzz sp",
103  ccstruct_->params())
104  , double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern", ccstruct_->params())
105  , double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp", ccstruct_->params())
106  , double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp", ccstruct_->params())
107  , double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp", ccstruct_->params())
108  , double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier", ccstruct_->params())
109  , double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier", ccstruct_->params())
110  , double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space", ccstruct_->params())
111  , double_MEMBER(tosp_enough_small_gaps, 0.65, "Fract of kerns reqd for isolated row stats",
112  ccstruct_->params())
113  , double_MEMBER(tosp_table_kn_sp_ratio, 2.25, "Min difference of kn & sp in table",
114  ccstruct_->params())
115  , double_MEMBER(tosp_table_xht_sp_ratio, 0.33, "Expect spaces bigger than this",
116  ccstruct_->params())
117  , double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0, "Fuzzy if less than this",
118  ccstruct_->params())
119  , double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg", ccstruct_->params())
120  , double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg", ccstruct_->params())
121  , double_MEMBER(tosp_min_sane_kn_sp, 1.5, "Don't trust spaces less than this time kn",
122  ccstruct_->params())
123  , double_MEMBER(tosp_init_guess_kn_mult, 2.2, "Thresh guess - mult kn by this",
124  ccstruct_->params())
125  , double_MEMBER(tosp_init_guess_xht_mult, 0.28, "Thresh guess - mult xht by this",
126  ccstruct_->params())
127  , double_MEMBER(tosp_max_sane_kn_thresh, 5.0, "Multiplier on kn to limit thresh",
128  ccstruct_->params())
129  , double_MEMBER(tosp_flip_caution, 0.0, "Don't autoflip kn to sp when large separation",
130  ccstruct_->params())
131  , double_MEMBER(tosp_large_kerning, 0.19, "Limit use of xht gap with large kns",
132  ccstruct_->params())
133  , double_MEMBER(tosp_dont_fool_with_small_kerns, -1, "Limit use of xht gap with odd small kns",
134  ccstruct_->params())
135  , double_MEMBER(tosp_near_lh_edge, 0, "Don't reduce box if the top left is non blank",
136  ccstruct_->params())
137  , double_MEMBER(tosp_silly_kn_sp_gap, 0.2, "Don't let sp minus kn get too small",
138  ccstruct_->params())
139  , double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75, "How wide fuzzies need context",
140  ccstruct_->params())
141  ,
142  // tordmain.cpp ///////////////////////////////////////////
143  BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs", ccstruct_->params())
144  , BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs", ccstruct_->params())
145  , BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs", ccstruct_->params())
146  , INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise", ccstruct_->params())
147  , INT_MEMBER(textord_baseline_debug, 0, "Baseline debug level", ccstruct_->params())
148  , double_MEMBER(textord_noise_area_ratio, 0.7, "Fraction of bounding box for noise",
149  ccstruct_->params())
150  , double_MEMBER(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess",
151  ccstruct_->params())
152  , double_MEMBER(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess",
153  ccstruct_->params())
154  , INT_MEMBER(textord_noise_sizefraction, 10, "Fraction of size for maxima", ccstruct_->params())
155  , double_MEMBER(textord_noise_sizelimit, 0.5, "Fraction of x for big t count",
156  ccstruct_->params())
157  , INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob", ccstruct_->params())
158  , double_MEMBER(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion",
159  ccstruct_->params())
160  , BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words", ccstruct_->params())
161  , BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows", ccstruct_->params())
162  , double_MEMBER(textord_noise_syfract, 0.2, "xh fract height error for norm blobs",
163  ccstruct_->params())
164  , double_MEMBER(textord_noise_sxfract, 0.4, "xh fract width error for norm blobs",
165  ccstruct_->params())
166  , double_MEMBER(textord_noise_hfract, 1.0 / 64,
167  "Height fraction to discard outlines as speckle noise", ccstruct_->params())
168  , INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row", ccstruct_->params())
169  , double_MEMBER(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion",
170  ccstruct_->params())
171  , BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector", ccstruct_->params())
172  , double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift", ccstruct_->params())
173  , double_MEMBER(textord_blshift_xfraction, 9.99, "Min size of baseline shift",
174  ccstruct_->params()) {}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:368
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:374
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:370
ParamsVectors * params()
Definition: ccutil.h:53

◆ ~Textord()

tesseract::Textord::~Textord ( )
default

Member Function Documentation

◆ BOOL_VAR_H() [1/28]

tesseract::Textord::BOOL_VAR_H ( textord_no_rejects  )

◆ BOOL_VAR_H() [2/28]

tesseract::Textord::BOOL_VAR_H ( textord_noise_debug  )

◆ BOOL_VAR_H() [3/28]

tesseract::Textord::BOOL_VAR_H ( textord_noise_rejrows  )

◆ BOOL_VAR_H() [4/28]

tesseract::Textord::BOOL_VAR_H ( textord_noise_rejwords  )

◆ BOOL_VAR_H() [5/28]

tesseract::Textord::BOOL_VAR_H ( textord_show_blobs  )

◆ BOOL_VAR_H() [6/28]

tesseract::Textord::BOOL_VAR_H ( textord_show_boxes  )

◆ BOOL_VAR_H() [7/28]

tesseract::Textord::BOOL_VAR_H ( textord_single_height_mode  )

◆ BOOL_VAR_H() [8/28]

tesseract::Textord::BOOL_VAR_H ( tosp_all_flips_fuzzy  )

◆ BOOL_VAR_H() [9/28]

tesseract::Textord::BOOL_VAR_H ( tosp_block_use_cert_spaces  )

◆ BOOL_VAR_H() [10/28]

tesseract::Textord::BOOL_VAR_H ( tosp_flip_fuzz_kn_to_sp  )

◆ BOOL_VAR_H() [11/28]

tesseract::Textord::BOOL_VAR_H ( tosp_flip_fuzz_sp_to_kn  )

◆ BOOL_VAR_H() [12/28]

tesseract::Textord::BOOL_VAR_H ( tosp_force_wordbreak_on_punct  )

◆ BOOL_VAR_H() [13/28]

tesseract::Textord::BOOL_VAR_H ( tosp_fuzzy_limit_all  )

◆ BOOL_VAR_H() [14/28]

tesseract::Textord::BOOL_VAR_H ( tosp_improve_thresh  )

◆ BOOL_VAR_H() [15/28]

tesseract::Textord::BOOL_VAR_H ( tosp_narrow_blobs_not_cert  )

◆ BOOL_VAR_H() [16/28]

tesseract::Textord::BOOL_VAR_H ( tosp_old_to_bug_fix  )

◆ BOOL_VAR_H() [17/28]

tesseract::Textord::BOOL_VAR_H ( tosp_old_to_constrain_sp_kn  )

◆ BOOL_VAR_H() [18/28]

tesseract::Textord::BOOL_VAR_H ( tosp_old_to_method  )

◆ BOOL_VAR_H() [19/28]

tesseract::Textord::BOOL_VAR_H ( tosp_only_small_gaps_for_kern  )

◆ BOOL_VAR_H() [20/28]

tesseract::Textord::BOOL_VAR_H ( tosp_only_use_prop_rows  )

◆ BOOL_VAR_H() [21/28]

tesseract::Textord::BOOL_VAR_H ( tosp_only_use_xht_gaps  )

◆ BOOL_VAR_H() [22/28]

tesseract::Textord::BOOL_VAR_H ( tosp_recovery_isolated_row_stats  )

◆ BOOL_VAR_H() [23/28]

tesseract::Textord::BOOL_VAR_H ( tosp_row_use_cert_spaces  )

◆ BOOL_VAR_H() [24/28]

tesseract::Textord::BOOL_VAR_H ( tosp_row_use_cert_spaces1  )

◆ BOOL_VAR_H() [25/28]

tesseract::Textord::BOOL_VAR_H ( tosp_rule_9_test_punct  )

◆ BOOL_VAR_H() [26/28]

tesseract::Textord::BOOL_VAR_H ( tosp_stats_use_xht_gaps  )

◆ BOOL_VAR_H() [27/28]

tesseract::Textord::BOOL_VAR_H ( tosp_use_pre_chopping  )

◆ BOOL_VAR_H() [28/28]

tesseract::Textord::BOOL_VAR_H ( tosp_use_xht_gaps  )

◆ CleanupSingleRowResult()

void tesseract::Textord::CleanupSingleRowResult ( PageSegMode  pageseg_mode,
PAGE_RES * page_res 
)

Definition at line 264 of file textord.cpp.

264  {
265  if (PSM_LINE_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) {
266  return; // No cleanup required.
267  }
268  PAGE_RES_IT it(page_res);
269  // Find the best row, being the greatest mean word conf.
270  float row_total_conf = 0.0f;
271  int row_word_count = 0;
272  ROW_RES *best_row = nullptr;
273  float best_conf = 0.0f;
274  for (it.restart_page(); it.word() != nullptr; it.forward()) {
275  WERD_RES *word = it.word();
276  row_total_conf += word->best_choice->certainty();
277  ++row_word_count;
278  if (it.next_row() != it.row()) {
279  row_total_conf /= row_word_count;
280  if (best_row == nullptr || best_conf < row_total_conf) {
281  best_row = it.row();
282  best_conf = row_total_conf;
283  }
284  row_total_conf = 0.0f;
285  row_word_count = 0;
286  }
287  }
288  // Now eliminate any word not in the best row.
289  for (it.restart_page(); it.word() != nullptr; it.forward()) {
290  if (it.row() != best_row) {
291  it.DeleteCurrentWord();
292  }
293  }
294 }
bool PSM_LINE_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:203
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:197

◆ compute_block_xheight()

void tesseract::Textord::compute_block_xheight ( TO_BLOCK * block,
float  gradient 
)

Definition at line 1278 of file makerow.cpp.

1278  {
1279  TO_ROW *row; // current row
1280  float asc_frac_xheight = CCStruct::kAscenderFraction / CCStruct::kXHeightFraction;
1281  float desc_frac_xheight = CCStruct::kDescenderFraction / CCStruct::kXHeightFraction;
1282  int32_t min_height, max_height; // limits on xheight
1283  TO_ROW_IT row_it = block->get_rows();
1284  if (row_it.empty()) {
1285  return; // no rows
1286  }
1287 
1288  // Compute the best guess of xheight of each row individually.
1289  // Use xheight and ascrise values of the rows where ascenders were found.
1290  get_min_max_xheight(block->line_size, &min_height, &max_height);
1291  STATS row_asc_xheights(min_height, max_height + 1);
1292  STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
1293  static_cast<int>(max_height * asc_frac_xheight) + 1);
1294  int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
1295  int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
1296  STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
1297  STATS row_desc_xheights(min_height, max_height + 1);
1298  STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
1299  STATS row_cap_xheights(min_height, max_height + 1);
1300  STATS row_cap_floating_xheights(min_height, max_height + 1);
1301  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1302  row = row_it.data();
1303  // Compute the xheight of this row if it has not been computed before.
1304  if (row->xheight <= 0) {
1305  compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size);
1306  }
1307  ROW_CATEGORY row_category = get_row_category(row);
1308  if (row_category == ROW_ASCENDERS_FOUND) {
1309  row_asc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
1310  row_asc_ascrise.add(static_cast<int32_t>(row->ascrise), row->xheight_evidence);
1311  row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
1312  } else if (row_category == ROW_DESCENDERS_FOUND) {
1313  row_desc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
1314  row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
1315  } else if (row_category == ROW_UNKNOWN) {
1316  fill_heights(row, gradient, min_height, max_height, &row_cap_xheights,
1317  &row_cap_floating_xheights);
1318  }
1319  }
1320 
1321  float xheight = 0.0;
1322  float ascrise = 0.0;
1323  float descdrop = 0.0;
1324  // Compute our best guess of xheight of this block.
1325  if (row_asc_xheights.get_total() > 0) {
1326  // Determine xheight from rows where ascenders were found.
1327  xheight = row_asc_xheights.median();
1328  ascrise = row_asc_ascrise.median();
1329  descdrop = -row_asc_descdrop.median();
1330  } else if (row_desc_xheights.get_total() > 0) {
1331  // Determine xheight from rows where descenders were found.
1332  xheight = row_desc_xheights.median();
1333  descdrop = -row_desc_descdrop.median();
1334  } else if (row_cap_xheights.get_total() > 0) {
1335  // All the rows in the block were (a/de)scenderless.
1336  // Try to search for two modes in row_cap_heights that could
1337  // be the xheight and the capheight (e.g. some of the rows
1338  // were lowercase, but did not have enough (a/de)scenders.
1339  // If such two modes can not be found, this block is most
1340  // likely all caps (or all small caps, in which case the code
1341  // still works as intended).
1342  compute_xheight_from_modes(
1343  &row_cap_xheights, &row_cap_floating_xheights,
1344  textord_single_height_mode && block->block->classify_rotation().y() == 0.0, min_height,
1345  max_height, &(xheight), &(ascrise));
1346  if (ascrise == 0) { // assume only caps in the whole block
1347  xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
1348  }
1349  } else { // default block sizes
1350  xheight = block->line_size * CCStruct::kXHeightFraction;
1351  }
1352  // Correct xheight, ascrise and descdrop if necessary.
1353  bool corrected_xheight = false;
1354  if (xheight < textord_min_xheight) {
1355  xheight = static_cast<float>(textord_min_xheight);
1356  corrected_xheight = true;
1357  }
1358  if (corrected_xheight || ascrise <= 0) {
1359  ascrise = xheight * asc_frac_xheight;
1360  }
1361  if (corrected_xheight || descdrop >= 0) {
1362  descdrop = -(xheight * desc_frac_xheight);
1363  }
1364  block->xheight = xheight;
1365 
1366  if (textord_debug_xheights) {
1367  tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise,
1368  descdrop);
1369  }
1370  // Correct xheight, ascrise, descdrop of rows based on block averages.
1371  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1372  correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
1373  }
1374 }
void get_min_max_xheight(int block_linesize, int *min_height, int *max_height)
Definition: makerow.h:86
int textord_min_xheight
Definition: makerow.cpp:70
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
ROW_CATEGORY
Definition: makerow.h:36
@ ROW_ASCENDERS_FOUND
Definition: makerow.h:37
@ ROW_DESCENDERS_FOUND
Definition: makerow.h:38
@ ROW_UNKNOWN
Definition: makerow.h:39
bool textord_debug_xheights
Definition: makerow.cpp:59
void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, STATS *floating_heights)
Definition: makerow.cpp:1418
int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only, int min_height, int max_height, float *xheight, float *ascrise)
Definition: makerow.cpp:1480
void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop)
Definition: makerow.cpp:1690
ROW_CATEGORY get_row_category(const TO_ROW *row)
Definition: makerow.h:94
static const double kXHeightCapRatio
Definition: ccstruct.h:37
static const double kXHeightFraction
Definition: ccstruct.h:34
static const double kDescenderFraction
Definition: ccstruct.h:33
static const double kAscenderFraction
Definition: ccstruct.h:35
void compute_row_xheight(TO_ROW *row, const FCOORD &rotation, float gradient, int block_line_size)
Definition: makerow.cpp:1384

◆ compute_row_xheight()

void tesseract::Textord::compute_row_xheight ( TO_ROW * row,
const FCOORD & rotation,
float  gradient,
int  block_line_size 
)

Definition at line 1384 of file makerow.cpp.

1387  {
1388  // Find blobs representing repeated characters in rows and mark them.
1389  // This information is used for computing row xheight and at a later
1390  // stage when words are formed by make_words.
1391  if (!row->rep_chars_marked()) {
1392  mark_repeated_chars(row);
1393  }
1394 
1395  int min_height, max_height;
1396  get_min_max_xheight(block_line_size, &min_height, &max_height);
1397  STATS heights(min_height, max_height + 1);
1398  STATS floating_heights(min_height, max_height + 1);
1399  fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights);
1400  row->ascrise = 0.0f;
1401  row->xheight = 0.0f;
1402  row->xheight_evidence = compute_xheight_from_modes(
1403  &heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height,
1404  max_height, &(row->xheight), &(row->ascrise));
1405  row->descdrop = 0.0f;
1406  if (row->xheight > 0) {
1407  row->descdrop =
1408  static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
1409  }
1410 }
void mark_repeated_chars(TO_ROW *row)
Definition: makerow.cpp:2563
int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count, STATS *asc_heights)
Definition: makerow.cpp:1576

◆ double_VAR_H() [1/44]

tesseract::Textord::double_VAR_H ( textord_blshift_maxshift  )

◆ double_VAR_H() [2/44]

tesseract::Textord::double_VAR_H ( textord_blshift_xfraction  )

◆ double_VAR_H() [3/44]

tesseract::Textord::double_VAR_H ( textord_initialasc_ile  )

◆ double_VAR_H() [4/44]

tesseract::Textord::double_VAR_H ( textord_initialx_ile  )

◆ double_VAR_H() [5/44]

tesseract::Textord::double_VAR_H ( textord_noise_area_ratio  )

◆ double_VAR_H() [6/44]

tesseract::Textord::double_VAR_H ( textord_noise_hfract  )

◆ double_VAR_H() [7/44]

tesseract::Textord::double_VAR_H ( textord_noise_normratio  )

◆ double_VAR_H() [8/44]

tesseract::Textord::double_VAR_H ( textord_noise_rowratio  )

◆ double_VAR_H() [9/44]

tesseract::Textord::double_VAR_H ( textord_noise_sizelimit  )

◆ double_VAR_H() [10/44]

tesseract::Textord::double_VAR_H ( textord_noise_sxfract  )

◆ double_VAR_H() [11/44]

tesseract::Textord::double_VAR_H ( textord_noise_syfract  )

◆ double_VAR_H() [12/44]

tesseract::Textord::double_VAR_H ( tosp_dont_fool_with_small_kerns  )

◆ double_VAR_H() [13/44]

tesseract::Textord::double_VAR_H ( tosp_enough_small_gaps  )

◆ double_VAR_H() [14/44]

tesseract::Textord::double_VAR_H ( tosp_flip_caution  )

◆ double_VAR_H() [15/44]

tesseract::Textord::double_VAR_H ( tosp_fuzzy_kn_fraction  )

◆ double_VAR_H() [16/44]

tesseract::Textord::double_VAR_H ( tosp_fuzzy_sp_fraction  )

◆ double_VAR_H() [17/44]

tesseract::Textord::double_VAR_H ( tosp_fuzzy_space_factor  )

◆ double_VAR_H() [18/44]

tesseract::Textord::double_VAR_H ( tosp_fuzzy_space_factor1  )

◆ double_VAR_H() [19/44]

tesseract::Textord::double_VAR_H ( tosp_fuzzy_space_factor2  )

◆ double_VAR_H() [20/44]

tesseract::Textord::double_VAR_H ( tosp_gap_factor  )

◆ double_VAR_H() [21/44]

tesseract::Textord::double_VAR_H ( tosp_ignore_big_gaps  )

◆ double_VAR_H() [22/44]

tesseract::Textord::double_VAR_H ( tosp_ignore_very_big_gaps  )

◆ double_VAR_H() [23/44]

tesseract::Textord::double_VAR_H ( tosp_init_guess_kn_mult  )

◆ double_VAR_H() [24/44]

tesseract::Textord::double_VAR_H ( tosp_init_guess_xht_mult  )

◆ double_VAR_H() [25/44]

tesseract::Textord::double_VAR_H ( tosp_kern_gap_factor1  )

◆ double_VAR_H() [26/44]

tesseract::Textord::double_VAR_H ( tosp_kern_gap_factor2  )

◆ double_VAR_H() [27/44]

tesseract::Textord::double_VAR_H ( tosp_kern_gap_factor3  )

◆ double_VAR_H() [28/44]

tesseract::Textord::double_VAR_H ( tosp_large_kerning  )

◆ double_VAR_H() [29/44]

tesseract::Textord::double_VAR_H ( tosp_max_sane_kn_thresh  )

◆ double_VAR_H() [30/44]

tesseract::Textord::double_VAR_H ( tosp_min_sane_kn_sp  )

◆ double_VAR_H() [31/44]

tesseract::Textord::double_VAR_H ( tosp_narrow_aspect_ratio  )

◆ double_VAR_H() [32/44]

tesseract::Textord::double_VAR_H ( tosp_narrow_fraction  )

◆ double_VAR_H() [33/44]

tesseract::Textord::double_VAR_H ( tosp_near_lh_edge  )

◆ double_VAR_H() [34/44]

tesseract::Textord::double_VAR_H ( tosp_old_sp_kn_th_factor  )

◆ double_VAR_H() [35/44]

tesseract::Textord::double_VAR_H ( tosp_pass_wide_fuzz_sp_to_context  )

◆ double_VAR_H() [36/44]

tesseract::Textord::double_VAR_H ( tosp_rep_space  )

◆ double_VAR_H() [37/44]

tesseract::Textord::double_VAR_H ( tosp_silly_kn_sp_gap  )

◆ double_VAR_H() [38/44]

tesseract::Textord::double_VAR_H ( tosp_table_fuzzy_kn_sp_ratio  )

◆ double_VAR_H() [39/44]

tesseract::Textord::double_VAR_H ( tosp_table_kn_sp_ratio  )

◆ double_VAR_H() [40/44]

tesseract::Textord::double_VAR_H ( tosp_table_xht_sp_ratio  )

◆ double_VAR_H() [41/44]

tesseract::Textord::double_VAR_H ( tosp_threshold_bias1  )

◆ double_VAR_H() [42/44]

tesseract::Textord::double_VAR_H ( tosp_threshold_bias2  )

◆ double_VAR_H() [43/44]

tesseract::Textord::double_VAR_H ( tosp_wide_aspect_ratio  )

◆ double_VAR_H() [44/44]

tesseract::Textord::double_VAR_H ( tosp_wide_fraction  )

◆ filter_blobs()

void tesseract::Textord::filter_blobs ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks,
bool  testing_on 
)

Definition at line 238 of file tordmain.cpp.

240  { // for plotting
241  TO_BLOCK_IT block_it = blocks; // destination iterator
242  TO_BLOCK *block; // created block
243 
244 #ifndef GRAPHICS_DISABLED
245  if (to_win != nullptr) {
246  to_win->Clear();
247  }
248 #endif // !GRAPHICS_DISABLED
249 
250  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
251  block = block_it.data();
252  block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs,
253  &block->large_blobs);
254  if (block->line_size == 0) {
255  block->line_size = 1;
256  }
257  block->line_spacing =
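258  block->line_size *
259  (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction +
260  2 * tesseract::CCStruct::kAscenderFraction) /
261  tesseract::CCStruct::kXHeightFraction;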
262  block->line_size *= textord_min_linesize;
263  block->max_blob_size = block->line_size * textord_excess_blobsize;
264 
265 #ifndef GRAPHICS_DISABLED
266  if (textord_show_blobs && testing_on) {
267  if (to_win == nullptr) {
268  create_to_win(page_tr);
269  }
270  block->plot_graded_blobs(to_win);
271  }
272  if (textord_show_boxes && testing_on) {
273  if (to_win == nullptr) {
274  create_to_win(page_tr);
275  }
276  plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);
277  plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);
278  plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);
279  plot_box_list(to_win, &block->blobs, ScrollView::WHITE);
280  }
281 #endif // !GRAPHICS_DISABLED
282  }
283 }
double textord_excess_blobsize
Definition: makerow.cpp:81
ScrollView * to_win
Definition: drawtord.cpp:37
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:69
double textord_min_linesize
Definition: makerow.cpp:80
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47

◆ find_components()

void tesseract::Textord::find_components ( Image  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 211 of file tordmain.cpp.

211  {
212  int width = pixGetWidth(pix);
213  int height = pixGetHeight(pix);
214  if (width > INT16_MAX || height > INT16_MAX) {
215  tprintf("Input image too large! (%d, %d)\n", width, height);
216  return; // Can't handle it.
217  }
218 
219  BLOCK_IT block_it(blocks); // iterator
220  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
221  BLOCK *block = block_it.data();
222  if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {
223  extract_edges(pix, block);
224  }
225  }
226 
227  assign_blobs_to_blocks2(pix, blocks, to_blocks);
228  ICOORD page_tr(width, height);
229  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
230 }
bool textord_test_landscape
Definition: makerow.cpp:52
void assign_blobs_to_blocks2(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:162
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
Definition: tordmain.cpp:238

◆ INT_VAR_H() [1/11]

tesseract::Textord::INT_VAR_H ( textord_baseline_debug  )

◆ INT_VAR_H() [2/11]

tesseract::Textord::INT_VAR_H ( textord_max_noise_size  )

◆ INT_VAR_H() [3/11]

tesseract::Textord::INT_VAR_H ( textord_noise_sizefraction  )

◆ INT_VAR_H() [4/11]

tesseract::Textord::INT_VAR_H ( textord_noise_sncount  )

◆ INT_VAR_H() [5/11]

tesseract::Textord::INT_VAR_H ( textord_noise_translimit  )

◆ INT_VAR_H() [6/11]

tesseract::Textord::INT_VAR_H ( tosp_debug_level  )

◆ INT_VAR_H() [7/11]

tesseract::Textord::INT_VAR_H ( tosp_enough_space_samples_for_median  )

◆ INT_VAR_H() [8/11]

tesseract::Textord::INT_VAR_H ( tosp_few_samples  )

◆ INT_VAR_H() [9/11]

tesseract::Textord::INT_VAR_H ( tosp_redo_kern_limit  )

◆ INT_VAR_H() [10/11]

tesseract::Textord::INT_VAR_H ( tosp_sanity_method  )

◆ INT_VAR_H() [11/11]

tesseract::Textord::INT_VAR_H ( tosp_short_row  )

◆ make_blob_words()

ROW * tesseract::Textord::make_blob_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 1118 of file tospace.cpp.

1120  {
1121  bool bol; // start of line
1122  ROW *real_row; // output row
1123  C_OUTLINE_IT cout_it;
1124  C_BLOB_LIST cblobs;
1125  C_BLOB_IT cblob_it = &cblobs;
1126  WERD_LIST words;
1127  WERD *word; // new word
1128  BLOBNBOX_IT box_it; // iterator
1129  int16_t word_count = 0;
1130 
1131  cblob_it.set_to_list(&cblobs);
1132  box_it.set_to_list(row->blob_list());
1133  // new words
1134  WERD_IT word_it(&words);
1135  bol = true;
1136  if (!box_it.empty()) {
1137  do {
1138  auto bblob = box_it.data();
1139  auto blob_box = bblob->bounding_box();
1140  if (bblob->joined_to_prev()) {
1141  auto cblob = bblob->remove_cblob();
1142  if (cblob != nullptr) {
1143  cout_it.set_to_list(cblob_it.data()->out_list());
1144  cout_it.move_to_last();
1145  cout_it.add_list_after(cblob->out_list());
1146  delete cblob;
1147  }
1148  } else {
1149  auto cblob = bblob->cblob();
1150  if (cblob != nullptr) {
1151  bblob->set_owns_cblob(false);
1152  cblob_it.add_after_then_move(cblob);
1153  }
1154  }
1155  box_it.forward(); // next one
1156  bblob = box_it.data();
1157  blob_box = bblob->bounding_box();
1158 
1159  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1160  word = new WERD(&cblobs, 1, nullptr);
1161  word_count++;
1162  word_it.add_after_then_move(word);
1163  if (bol) {
1164  word->set_flag(W_BOL, true);
1165  bol = false;
1166  }
1167  if (box_it.at_first()) { // at end of line
1168  word->set_flag(W_EOL, true);
1169  }
1170  }
1171  } while (!box_it.at_first()); // until back at start
1172  /* Setup the row with created words. */
1173  real_row =
1174  new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1175  word_it.set_to_list(real_row->word_list());
1176  // put words in row
1177  word_it.add_list_after(&words);
1178  real_row->recalc_bounding_box();
1179  if (tosp_debug_level > 4) {
1180  tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1181  real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1182  real_row->bounding_box().right(), real_row->bounding_box().top());
1183  }
1184  return real_row;
1185  }
1186  return nullptr;
1187 }
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35

◆ make_prop_words()

ROW * tesseract::Textord::make_prop_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 844 of file tospace.cpp.

846  {
847  bool bol; // start of line
848  /* prev_ values are for start of word being built. non prev_ values are for
849 the gap between the word being built and the next one. */
850  bool prev_fuzzy_sp; // probably space
851  bool prev_fuzzy_non; // probably not
852  uint8_t prev_blanks; // in front of word
853  bool fuzzy_sp = false; // probably space
854  bool fuzzy_non = false; // probably not
855  uint8_t blanks = 0; // in front of word
856  bool prev_gap_was_a_space = false;
857  bool break_at_next_gap = false;
858  ROW *real_row; // output row
859  C_OUTLINE_IT cout_it;
860  C_BLOB_LIST cblobs;
861  C_BLOB_IT cblob_it = &cblobs;
862  WERD_LIST words;
863  WERD *word; // new word
864  int32_t next_rep_char_word_right = INT32_MAX;
865  float repetition_spacing; // gap between repetitions
866  int32_t xstarts[2]; // row ends
867  int32_t prev_x; // end of prev blob
868  BLOBNBOX_IT box_it; // iterator
869  TBOX prev_blob_box;
870  TBOX next_blob_box;
871  int16_t prev_gap = INT16_MAX;
872  int16_t current_gap = INT16_MAX;
873  int16_t next_gap = INT16_MAX;
874  int16_t prev_within_xht_gap = INT16_MAX;
875  int16_t current_within_xht_gap = INT16_MAX;
876  int16_t next_within_xht_gap = INT16_MAX;
877  int16_t word_count = 0;
878 
879  // repeated char words
880  WERD_IT rep_char_it(&(row->rep_words));
881  if (!rep_char_it.empty()) {
882  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
883  }
884 
885  prev_x = -INT16_MAX;
886  cblob_it.set_to_list(&cblobs);
887  box_it.set_to_list(row->blob_list());
888  // new words
889  WERD_IT word_it(&words);
890  bol = true;
891  prev_blanks = 0;
892  prev_fuzzy_sp = false;
893  prev_fuzzy_non = false;
894  if (!box_it.empty()) {
895  xstarts[0] = box_it.data()->bounding_box().left();
896  if (xstarts[0] > next_rep_char_word_right) {
897  /* We need to insert a repeated char word at the start of the row */
898  word = rep_char_it.extract();
899  word_it.add_after_then_move(word);
900  /* Set spaces before repeated char word */
901  word->set_flag(W_BOL, true);
902  bol = false;
903  word->set_blanks(0);
904  // NO uncertainty
905  word->set_flag(W_FUZZY_SP, false);
906  word->set_flag(W_FUZZY_NON, false);
907  xstarts[0] = word->bounding_box().left();
908  /* Set spaces after repeated char word (and leave current word set) */
909  repetition_spacing = find_mean_blob_spacing(word);
910  current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
911  current_within_xht_gap = current_gap;
912  if (current_gap > tosp_rep_space * repetition_spacing) {
913  prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
914  if (prev_blanks < 1) {
915  prev_blanks = 1;
916  }
917  } else {
918  prev_blanks = 0;
919  }
920  if (tosp_debug_level > 5) {
921  tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
922  box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
923  repetition_spacing, current_gap);
924  }
925  prev_fuzzy_sp = false;
926  prev_fuzzy_non = false;
927  if (rep_char_it.empty()) {
928  next_rep_char_word_right = INT32_MAX;
929  } else {
930  rep_char_it.forward();
931  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
932  }
933  }
934 
935  peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
936  do {
937  auto bblob = box_it.data();
938  auto blob_box = bblob->bounding_box();
939  if (bblob->joined_to_prev()) {
940  auto cblob = bblob->remove_cblob();
941  if (cblob != nullptr) {
942  cout_it.set_to_list(cblob_it.data()->out_list());
943  cout_it.move_to_last();
944  cout_it.add_list_after(cblob->out_list());
945  delete cblob;
946  }
947  } else {
948  auto cblob = bblob->cblob();
949  if (cblob != nullptr) {
950  bblob->set_owns_cblob(false);
951  cblob_it.add_after_then_move(cblob);
952  }
953  prev_x = blob_box.right();
954  }
955  box_it.forward(); // next one
956  bblob = box_it.data();
957  blob_box = bblob->bounding_box();
958 
959  if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
960  /* Real Blob - not multiple outlines or pre-chopped */
961  prev_gap = current_gap;
962  prev_within_xht_gap = current_within_xht_gap;
963  prev_blob_box = next_blob_box;
964  current_gap = next_gap;
965  current_within_xht_gap = next_within_xht_gap;
966  peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
967 
968  int16_t prev_gap_arg = prev_gap;
969  int16_t next_gap_arg = next_gap;
970  if (tosp_only_use_xht_gaps) {
971  prev_gap_arg = prev_within_xht_gap;
972  next_gap_arg = next_within_xht_gap;
973  }
974  // Decide if a word-break should be inserted
975  if (blob_box.left() > next_rep_char_word_right ||
976  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
977  current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
978  fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
979  box_it.at_first()) {
980  /* Form a new word out of the blobs collected */
981  word = new WERD(&cblobs, prev_blanks, nullptr);
982  word_count++;
983  word_it.add_after_then_move(word);
984  if (bol) {
985  word->set_flag(W_BOL, true);
986  bol = false;
987  }
988  if (prev_fuzzy_sp) {
989  // probably space
990  word->set_flag(W_FUZZY_SP, true);
991  } else if (prev_fuzzy_non) {
992  word->set_flag(W_FUZZY_NON, true);
993  }
994  // probably not
995 
996  if (blob_box.left() > next_rep_char_word_right) {
997  /* We need to insert a repeated char word */
998  word = rep_char_it.extract();
999  word_it.add_after_then_move(word);
1000 
1001  /* Set spaces before repeated char word */
1002  repetition_spacing = find_mean_blob_spacing(word);
1003  current_gap = word->bounding_box().left() - prev_x;
1004  current_within_xht_gap = current_gap;
1005  if (current_gap > tosp_rep_space * repetition_spacing) {
1006  blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1007  if (blanks < 1) {
1008  blanks = 1;
1009  }
1010  } else {
1011  blanks = 0;
1012  }
1013  if (tosp_debug_level > 5) {
1014  tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1015  word->bounding_box().left(), word->bounding_box().bottom(),
1016  repetition_spacing, current_gap, blanks);
1017  }
1018  word->set_blanks(blanks);
1019  // NO uncertainty
1020  word->set_flag(W_FUZZY_SP, false);
1021  word->set_flag(W_FUZZY_NON, false);
1022 
1023  /* Set spaces after repeated char word (and leave current word set)
1024  */
1025  current_gap = blob_box.left() - next_rep_char_word_right;
1026  if (current_gap > tosp_rep_space * repetition_spacing) {
1027  blanks = static_cast<uint8_t>(current_gap / row->space_size);
1028  if (blanks < 1) {
1029  blanks = 1;
1030  }
1031  } else {
1032  blanks = 0;
1033  }
1034  if (tosp_debug_level > 5) {
1035  tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
1036  }
1037  fuzzy_sp = false;
1038  fuzzy_non = false;
1039 
1040  if (rep_char_it.empty()) {
1041  next_rep_char_word_right = INT32_MAX;
1042  } else {
1043  rep_char_it.forward();
1044  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1045  }
1046  }
1047 
1048  if (box_it.at_first() && rep_char_it.empty()) {
1049  // at end of line
1050  word->set_flag(W_EOL, true);
1051  xstarts[1] = prev_x;
1052  } else {
1053  prev_blanks = blanks;
1054  prev_fuzzy_sp = fuzzy_sp;
1055  prev_fuzzy_non = fuzzy_non;
1056  }
1057  }
1058  }
1059  } while (!box_it.at_first()); // until back at start
1060 
1061  /* Insert any further repeated char words */
1062  while (!rep_char_it.empty()) {
1063  word = rep_char_it.extract();
1064  word_it.add_after_then_move(word);
1065 
1066  /* Set spaces before repeated char word */
1067  repetition_spacing = find_mean_blob_spacing(word);
1068  current_gap = word->bounding_box().left() - prev_x;
1069  if (current_gap > tosp_rep_space * repetition_spacing) {
1070  blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1071  if (blanks < 1) {
1072  blanks = 1;
1073  }
1074  } else {
1075  blanks = 0;
1076  }
1077  if (tosp_debug_level > 5) {
1078  tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1079  word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
1080  current_gap, blanks);
1081  }
1082  word->set_blanks(blanks);
1083  // NO uncertainty
1084  word->set_flag(W_FUZZY_SP, false);
1085  word->set_flag(W_FUZZY_NON, false);
1086  prev_x = word->bounding_box().right();
1087  if (rep_char_it.empty()) {
1088  // at end of line
1089  word->set_flag(W_EOL, true);
1090  xstarts[1] = prev_x;
1091  } else {
1092  rep_char_it.forward();
1093  }
1094  }
1095  real_row =
1096  new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1097  word_it.set_to_list(real_row->word_list());
1098  // put words in row
1099  word_it.add_list_after(&words);
1100  real_row->recalc_bounding_box();
1101 
1102  if (tosp_debug_level > 4) {
1103  tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1104  real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1105  real_row->bounding_box().right(), real_row->bounding_box().top());
1106  }
1107  return real_row;
1108  }
1109  return nullptr;
1110 }
@ TBOX
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42

◆ make_spline_rows()

void tesseract::Textord::make_spline_rows ( TO_BLOCK *  block,
float  gradient,
bool  testing_on 
)

Definition at line 1998 of file makerow.cpp.

2000  {
2001 #ifndef GRAPHICS_DISABLED
2002  ScrollView::Color colour; // of row
2003 #endif
2004  TO_ROW_IT row_it = block->get_rows();
2005 
2006  row_it.move_to_first();
2007  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2008  if (row_it.data()->blob_list()->empty()) {
2009  delete row_it.extract(); // nothing in it
2010  } else {
2011  make_baseline_spline(row_it.data(), block);
2012  }
2013  }
2014  if (textord_old_baselines) {
2015 #ifndef GRAPHICS_DISABLED
2016  if (testing_on) {
2017  colour = ScrollView::RED;
2018  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2019  row_it.data()->baseline.plot(to_win, colour);
2020  colour = static_cast<ScrollView::Color>(colour + 1);
2021  if (colour > ScrollView::MAGENTA) {
2022  colour = ScrollView::RED;
2023  }
2024  }
2025  }
2026 #endif
2027  make_old_baselines(block, testing_on, gradient);
2028  }
2029 #ifndef GRAPHICS_DISABLED
2030  if (testing_on) {
2031  colour = ScrollView::RED;
2032  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2033  row_it.data()->baseline.plot(to_win, colour);
2034  colour = static_cast<ScrollView::Color>(colour + 1);
2035  if (colour > ScrollView::MAGENTA) {
2036  colour = ScrollView::RED;
2037  }
2038  }
2039  }
2040 #endif
2041 }
bool textord_old_baselines
Definition: makerow.cpp:55
void make_baseline_spline(TO_ROW *row, TO_BLOCK *block)
Definition: makerow.cpp:2050

◆ set_use_cjk_fp_model()

void tesseract::Textord::set_use_cjk_fp_model ( bool  flag)
inline

Definition at line 101 of file textord.h.

101  {
102  use_cjk_fp_model_ = flag;
103  }

◆ TextordPage()

void tesseract::Textord::TextordPage ( PageSegMode  pageseg_mode,
const FCOORD &  reskew,
int  width,
int  height,
Image  binary_pix,
Image  thresholds_pix,
Image  grey_pix,
bool  use_box_bottoms,
BLOBNBOX_LIST *  diacritic_blobs,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 177 of file textord.cpp.

180  {
181  page_tr_.set_x(width);
182  page_tr_.set_y(height);
183  if (to_blocks->empty()) {
184  // AutoPageSeg was not used, so we need to find_components first.
185  find_components(binary_pix, blocks, to_blocks);
186  TO_BLOCK_IT it(to_blocks);
187  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
188  TO_BLOCK *to_block = it.data();
189  // Compute the edge offsets whether or not there is a grey_pix.
190  // We have by-passed auto page seg, so we have to run it here.
191  // By page segmentation mode there is no non-text to avoid running on.
192  to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);
193  }
194  } else if (!PSM_SPARSE(pageseg_mode)) {
195  // AutoPageSeg does not need to find_components as it did that already.
196  // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
197  filter_blobs(page_tr_, to_blocks, true);
198  }
199 
200  ASSERT_HOST(!to_blocks->empty());
201  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
202  const FCOORD anticlockwise90(0.0f, 1.0f);
203  const FCOORD clockwise90(0.0f, -1.0f);
204  TO_BLOCK_IT it(to_blocks);
205  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
206  TO_BLOCK *to_block = it.data();
207  BLOCK *block = to_block->block;
208  // Create a fake poly_block in block from its bounding box.
209  block->pdblk.set_poly_block(new POLY_BLOCK(block->pdblk.bounding_box(), PT_VERTICAL_TEXT));
210  // Rotate the to_block along with its contained block and blobnbox lists.
211  to_block->rotate(anticlockwise90);
212  // Set the block's rotation values to obey the convention followed in
213  // layout analysis for vertical text.
214  block->set_re_rotation(clockwise90);
215  block->set_classify_rotation(clockwise90);
216  }
217  }
218 
219  TO_BLOCK_IT to_block_it(to_blocks);
220  TO_BLOCK *to_block = to_block_it.data();
221  // Make the rows in the block.
222  float gradient;
223  // Do it the old fashioned way.
224  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
225  gradient = make_rows(page_tr_, to_blocks);
226  } else if (!PSM_SPARSE(pageseg_mode)) {
227  // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
228  gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks);
229  } else {
230  gradient = 0.0f;
231  }
232  BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks);
233  baseline_detector.ComputeStraightBaselines(use_box_bottoms);
234  baseline_detector.ComputeBaselineSplinesAndXheights(
235  page_tr_, pageseg_mode != PSM_RAW_LINE, textord_heavy_nr, textord_show_final_rows, this);
236  // Now make the words in the lines.
237  if (PSM_WORD_FIND_ENABLED(pageseg_mode)) {
238  // SINGLE_LINE uses the old word maker on the single line.
239  make_words(this, page_tr_, gradient, blocks, to_blocks);
240  } else {
241  // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
242  // single word, and in SINGLE_CHAR mode, all the outlines
243  // go in a single blob.
244  TO_BLOCK *to_block = to_block_it.data();
245  make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(),
246  to_block->block->row_list());
247  }
248  // Remove empties.
249  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
250  TransferDiacriticsToBlockGroups(diacritic_blobs, blocks);
251  // Compute the margins for each row in the block, to be used later for
252  // paragraph detection.
253  BLOCK_IT b_it(blocks);
254  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
255  b_it.data()->compute_row_margins();
256  }
257 #ifndef GRAPHICS_DISABLED
258  close_to_win();
259 #endif
260 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
bool textord_show_final_rows
Definition: makerow.cpp:50
@ PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:166
@ PSM_SINGLE_CHAR
Treat the image as a single character.
Definition: publictypes.h:172
void close_to_win()
Definition: drawtord.cpp:56
float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: makerow.cpp:229
void make_words(tesseract::Textord *textord, ICOORD page_tr, float gradient, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: wordseg.cpp:99
bool textord_heavy_nr
Definition: makerow.cpp:46
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows)
Definition: wordseg.cpp:53
bool PSM_WORD_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:206
float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks)
Definition: makerow.cpp:190
@ PT_VERTICAL_TEXT
Definition: publictypes.h:61
void set_x(TDimension xin)
rewrite function
Definition: points.h:67
void set_y(TDimension yin)
rewrite function
Definition: points.h:71
void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:211

◆ to_spacing()

void tesseract::Textord::to_spacing ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks 
)

Definition at line 45 of file tospace.cpp.

47  {
48  TO_BLOCK_IT block_it; // iterator
49  TO_BLOCK *block; // current block;
50  TO_ROW *row; // current row
51  int block_index; // block number
52  int row_index; // row number
53  // estimated width of real spaces for whole block
54  int16_t block_space_gap_width;
55  // estimated width of non space gaps for whole block
56  int16_t block_non_space_gap_width;
57  bool old_text_ord_proportional; // old fixed/prop result
58 
59  block_it.set_to_list(blocks);
60  block_index = 1;
61  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62  block = block_it.data();
63  std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64  block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65  block_non_space_gap_width);
66  // Make sure relative values of block-level space and non-space gap
67  // widths are reasonable. The ratio of 1:3 is also used in
68  // block_spacing_stats, to correct the block_space_gap_width.
69  // Useful for arabic and hindi, when the non-space gap width is
70  // often over-estimated and should not be trusted. A similar ratio
71  // is found in block_spacing_stats.
72  if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73  block_non_space_gap_width > block_space_gap_width / 3) {
74  block_non_space_gap_width = block_space_gap_width / 3;
75  }
76  // row iterator
77  TO_ROW_IT row_it(block->get_rows());
78  row_index = 1;
79  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80  row = row_it.data();
81  if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {
82  if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83  tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84  }
85  row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86  block_non_space_gap_width);
87  } else {
88  if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89  tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90  row_index, row->pitch_decision, row->fixed_pitch);
91  }
92  }
93 #ifndef GRAPHICS_DISABLED
94  if (textord_show_initial_words) {
95  plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96  }
97 #endif
98  row_index++;
99  }
100  block_index++;
101  }
102 }
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
bool textord_show_initial_words
Definition: tovars.cpp:25

◆ use_cjk_fp_model()

bool tesseract::Textord::use_cjk_fp_model ( ) const
inline

Definition at line 98 of file textord.h.

98  {
99  return use_cjk_fp_model_;
100  }

The documentation for this class was generated from the following files: