tesseract  5.0.0
tesseract::OL_BUCKETS Class Reference

#include <edgblob.h>

Public Member Functions

OL_BUCKETS::OL_BUCKETS

Construct an array of buckets for associating outlines into blobs.

 OL_BUCKETS (ICOORD bleft, ICOORD tright)
 
OL_BUCKETS::count_children

Find number of descendants of this outline.

int32_t count_children (C_OUTLINE *outline, int32_t max_count)
 
OL_BUCKETS::outline_complexity

This is the new version of count_children.

The goal of this function is to determine whether an outline and its interiors could be part of a character blob. This is done by computing a "complexity" index for the outline, which is the return value of this function, and checking it against a threshold. The max_count is used for short-circuiting the recursion and forcing a rejection that is guaranteed to fail the threshold test. The complexity F for an outline X with N children X[i] is F(X) = N + sum_i (F(X[i]) * edges_children_per_grandchild), so each layer of nesting increases complexity exponentially. An outline can be rejected as a text blob candidate if its complexity is too high, if it has too many children (likely a container), or if it has too many layers of nested inner loops. This has the side-effect of flattening out boxed or reversed video text regions.

int32_t outline_complexity (C_OUTLINE *outline, int32_t max_count, int16_t depth)
 
OL_BUCKETS::extract_children

Extract the children of this outline from the buckets and add them to the list referenced by the given iterator.

void extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it)
 

OL_BUCKETS::operator()

Return a pointer to a list of C_OUTLINEs corresponding to the given pixel coordinates.

C_OUTLINE_LIST * operator() (TDimension x, TDimension y)
 
C_OUTLINE_LIST * start_scan ()
 
C_OUTLINE_LIST * scan_next ()
 

Detailed Description

Definition at line 30 of file edgblob.h.

Constructor & Destructor Documentation

◆ OL_BUCKETS()

tesseract::OL_BUCKETS::OL_BUCKETS ( ICOORD  bleft,
ICOORD  tright 
)

Definition at line 64 of file edgblob.cpp.

66  : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),
67  bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),
68  buckets(bxdim * bydim),
69  bl(bleft),
70  tr(tright) {}
#define BUCKETSIZE
Definition: edgblob.cpp:29

Member Function Documentation

◆ count_children()

int32_t tesseract::OL_BUCKETS::count_children ( C_OUTLINE *  outline,
int32_t  max_count 
)

Definition at line 195 of file edgblob.cpp.

198  {
199  bool parent_box; // could it be boxy
200  TDimension xmin, xmax; // coord limits
201  TDimension ymin, ymax;
202  C_OUTLINE *child; // current child
203  int32_t child_count; // no of children
204  int32_t grandchild_count; // no of grandchildren
205  int32_t parent_area; // potential box
206  float max_parent_area; // potential box
207  int32_t child_area; // current child
208  int32_t child_length; // current child
209  TBOX olbox;
210  C_OUTLINE_IT child_it; // search iterator
211 
212  olbox = outline->bounding_box();
213  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
214  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
215  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
216  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
217  child_count = 0;
218  grandchild_count = 0;
219  parent_area = 0;
220  max_parent_area = 0;
221  parent_box = true;
222  for (auto yindex = ymin; yindex <= ymax; yindex++) {
223  for (auto xindex = xmin; xindex <= xmax; xindex++) {
224  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
225  if (child_it.empty()) {
226  continue;
227  }
228  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
229  child_it.forward()) {
230  child = child_it.data();
231  if (child != outline && *child < *outline) {
232  child_count++;
233  if (child_count <= max_count) {
234  int max_grand =
235  (max_count - child_count) / edges_children_per_grandchild;
236  if (max_grand > 0) {
237  grandchild_count += count_children(child, max_grand) *
238  edges_children_per_grandchild;
239  } else {
240  grandchild_count += count_children(child, 1);
241  }
242  }
243  if (child_count + grandchild_count > max_count) {
244  if (edges_debug) {
245  tprintf("Discarding parent with child count=%d, gc=%d\n",
246  child_count, grandchild_count);
247  }
248  return child_count + grandchild_count;
249  }
250  if (parent_area == 0) {
251  parent_area = outline->outer_area();
252  if (parent_area < 0) {
253  parent_area = -parent_area;
254  }
255  max_parent_area = outline->bounding_box().area() * edges_boxarea;
256  if (parent_area < max_parent_area) {
257  parent_box = false;
258  }
259  }
260  if (parent_box &&
261  (!edges_children_fix ||
262  child->bounding_box().height() > edges_min_nonhole)) {
263  child_area = child->outer_area();
264  if (child_area < 0) {
265  child_area = -child_area;
266  }
267  if (edges_children_fix) {
268  if (parent_area - child_area < max_parent_area) {
269  parent_box = false;
270  continue;
271  }
272  if (grandchild_count > 0) {
273  if (edges_debug) {
274  tprintf(
275  "Discarding parent of area %d, child area=%d, max%g "
276  "with gc=%d\n",
277  parent_area, child_area, max_parent_area,
278  grandchild_count);
279  }
280  return max_count + 1;
281  }
282  child_length = child->pathlength();
283  if (child_length * child_length >
284  child_area * edges_patharea_ratio) {
285  if (edges_debug) {
286  tprintf(
287  "Discarding parent of area %d, child area=%d, max%g "
288  "with child length=%d\n",
289  parent_area, child_area, max_parent_area, child_length);
290  }
291  return max_count + 1;
292  }
293  }
294  if (child_area < child->bounding_box().area() * edges_childarea) {
295  if (edges_debug) {
296  tprintf(
297  "Discarding parent of area %d, child area=%d, max%g "
298  "with child rect=%d\n",
299  parent_area, child_area, max_parent_area,
300  child->bounding_box().area());
301  }
302  return max_count + 1;
303  }
304  }
305  }
306  }
307  }
308  }
309  return child_count + grandchild_count;
310 }
@ TBOX
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int16_t TDimension
Definition: tesstypes.h:32
TDimension y() const
access function
Definition: points.h:62
TDimension x() const
access function
Definition: points.h:58
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:195

◆ extract_children()

void tesseract::OL_BUCKETS::extract_children ( C_OUTLINE *  outline,
C_OUTLINE_IT *  it 
)

Definition at line 318 of file edgblob.cpp.

321  {
322  TDimension xmin, xmax; // coord limits
323  TDimension ymin, ymax;
324  TBOX olbox;
325  C_OUTLINE_IT child_it; // search iterator
326 
327  olbox = outline->bounding_box();
328  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
329  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
330  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
331  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
332  for (auto yindex = ymin; yindex <= ymax; yindex++) {
333  for (auto xindex = xmin; xindex <= xmax; xindex++) {
334  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
335  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
336  child_it.forward()) {
337  if (*child_it.data() < *outline) {
338  it->add_after_then_move(child_it.extract());
339  }
340  }
341  }
342  }
343 }

◆ operator()()

C_OUTLINE_LIST * tesseract::OL_BUCKETS::operator() ( TDimension  x,
TDimension  y 
)

Definition at line 79 of file edgblob.cpp.

81  {
82  return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +
83  (x - bl.x()) / BUCKETSIZE];
84 }

◆ outline_complexity()

int32_t tesseract::OL_BUCKETS::outline_complexity ( C_OUTLINE *  outline,
int32_t  max_count,
int16_t  depth 
)

Definition at line 121 of file edgblob.cpp.

124  {
125  TDimension xmin, xmax; // coord limits
126  TDimension ymin, ymax;
127  C_OUTLINE *child; // current child
128  int32_t child_count; // no of children
129  int32_t grandchild_count; // no of grandchildren
130  C_OUTLINE_IT child_it; // search iterator
131 
132  TBOX olbox = outline->bounding_box();
133  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
134  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
135  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
136  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
137  child_count = 0;
138  grandchild_count = 0;
139  if (++depth > edges_max_children_layers) { // nested loops are too deep
140  return max_count + depth;
141  }
142 
143  for (auto yindex = ymin; yindex <= ymax; yindex++) {
144  for (auto xindex = xmin; xindex <= xmax; xindex++) {
145  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
146  if (child_it.empty()) {
147  continue;
148  }
149  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
150  child_it.forward()) {
151  child = child_it.data();
152  if (child == outline || !(*child < *outline)) {
153  continue;
154  }
155  child_count++;
156 
157  if (child_count > edges_max_children_per_outline) { // too fragmented
158  if (edges_debug) {
159  tprintf(
160  "Discard outline on child_count=%d > "
161  "max_children_per_outline=%d\n",
162  child_count,
163  static_cast<int32_t>(edges_max_children_per_outline));
164  }
165  return max_count + child_count;
166  }
167 
168  // Compute the "complexity" of each child recursively
169  int32_t remaining_count = max_count - child_count - grandchild_count;
170  if (remaining_count > 0) {
171  grandchild_count += edges_children_per_grandchild *
172  outline_complexity(child, remaining_count, depth);
173  }
174  if (child_count + grandchild_count > max_count) { // too complex
175  if (edges_debug) {
176  tprintf(
177  "Disgard outline on child_count=%d + grandchild_count=%d "
178  "> max_count=%d\n",
179  child_count, grandchild_count, max_count);
180  }
181  return child_count + grandchild_count;
182  }
183  }
184  }
185  }
186  return child_count + grandchild_count;
187 }
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:121

◆ scan_next()

C_OUTLINE_LIST * tesseract::OL_BUCKETS::scan_next ( )

Definition at line 90 of file edgblob.cpp.

90  {
91  return scan_next(it);
92 }
C_OUTLINE_LIST * scan_next()
Definition: edgblob.cpp:90

◆ start_scan()

C_OUTLINE_LIST * tesseract::OL_BUCKETS::start_scan ( )

Definition at line 86 of file edgblob.cpp.

86  {
87  return scan_next(buckets.begin());
88 }

The documentation for this class was generated from the following files: