tesseract  5.0.0
resultiterator_test.cc
Go to the documentation of this file.
1 
2 #include <allheaders.h>
3 #include <tesseract/baseapi.h>
5 #include <string>
6 #include "scrollview.h"
7 
8 #include "include_gunit.h"
9 #include "log.h" // for LOG
10 
11 namespace tesseract {
12 
13 // DEFINE_string(tess_config, "", "config file for tesseract");
14 // DEFINE_bool(visual_test, false, "Runs a visual test using scrollview");
15 
16 // The fixture for testing Tesseract.
17 class ResultIteratorTest : public testing::Test {
18 protected:
19  std::string TestDataNameToPath(const std::string &name) {
20  return file::JoinPath(TESTING_DIR, name);
21  }
22  std::string TessdataPath() {
23  return file::JoinPath(TESSDATA_DIR, "");
24  }
25  std::string OutputNameToPath(const std::string &name) {
27  return file::JoinPath(FLAGS_test_tmpdir, name);
28  }
29 
31  src_pix_ = nullptr;
32  }
33  ~ResultIteratorTest() override = default;
34 
35  void SetImage(const char *filename) {
36  src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
38  // if (!FLAGS_tess_config.empty())
39  // api_.ReadConfigFile(FLAGS_tess_config.c_str());
42  src_pix_.destroy();
44  }
45 
46  // Rebuilds the image using the binary images at the given level, and
47  // EXPECTs that the number of pixels in the xor of the rebuilt image with
48  // the original is at most max_diff.
49  void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator *it) {
50  it->Begin();
51  int width = pixGetWidth(src_pix_);
52  int height = pixGetHeight(src_pix_);
53  int depth = pixGetDepth(src_pix_);
54  Image pix = pixCreate(width, height, depth);
55  EXPECT_TRUE(depth == 1 || depth == 8);
56  if (depth == 8) {
57  pixSetAll(pix);
58  }
59  do {
60  int left, top, right, bottom;
61  PageIteratorLevel im_level = level;
62  // If the return is false, it is a non-text block so get the block image.
63  if (!it->BoundingBox(level, &left, &top, &right, &bottom)) {
64  im_level = tesseract::RIL_BLOCK;
65  EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom));
66  }
67  LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right << ", B:" << bottom
68  << "]"
69  << "\n";
70  Image block_pix;
71  if (depth == 1) {
72  block_pix = it->GetBinaryImage(im_level);
73  pixRasterop(pix, left, top, right - left, bottom - top, PIX_SRC ^ PIX_DST, block_pix, 0, 0);
74  } else {
75  block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top);
76  pixRasterop(pix, left, top, pixGetWidth(block_pix), pixGetHeight(block_pix),
77  PIX_SRC & PIX_DST, block_pix, 0, 0);
78  }
79  CHECK(block_pix != nullptr);
80  block_pix.destroy();
81  } while (it->Next(level));
82  // if (base::GetFlag(FLAGS_v) >= 1)
83  // pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG);
84  pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0);
85  if (depth == 8) {
86  Image binary_pix = pixThresholdToBinary(pix, 128);
87  pix.destroy();
88  pixInvert(binary_pix, binary_pix);
89  pix = binary_pix;
90  }
91  // if (base::GetFlag(FLAGS_v) >= 1)
92  // pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG);
93  l_int32 pixcount;
94  pixCountPixels(pix, &pixcount, nullptr);
95  if (pixcount > max_diff) {
96  std::string outfile = OutputNameToPath("failedxor.png");
97  LOG(INFO) << "outfile = " << outfile << "\n";
98  pixWrite(outfile.c_str(), pix, IFF_PNG);
99  }
100  pix.destroy();
101  LOG(INFO) << "At level " << level << ": pix diff = " << pixcount << "\n";
102  EXPECT_LE(pixcount, max_diff);
103  // if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff);
104  }
105 
106  // Rebuilds the text from the iterator strings at the given level, and
107  // EXPECTs that the rebuild string exactly matches the truth string.
108  void VerifyIteratorText(const std::string &truth, PageIteratorLevel level, ResultIterator *it) {
109  LOG(INFO) << "Text Test Level " << level << "\n";
110  it->Begin();
111  std::string result;
112  do {
113  char *text = it->GetUTF8Text(level);
114  result += text;
115  delete[] text;
116  if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) &&
118  if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) {
119  result += '\n';
120  } else {
121  result += ' ';
122  }
123  if (it->IsAtFinalElement(tesseract::RIL_PARA, level) &&
124  !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) {
125  result += '\n';
126  }
127  }
128  } while (it->Next(level));
129  EXPECT_STREQ(truth.c_str(), result.c_str()) << "Rebuild failed at Text Level " << level;
130  }
131 
132  void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit,
133  int symbol_limit, PageIterator *it, PageIteratorLevel maxlevel=tesseract::RIL_SYMBOL) {
134  VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);
135  VerifyRebuild(para_limit, tesseract::RIL_PARA, it);
136  VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);
137  VerifyRebuild(word_limit, tesseract::RIL_WORD, it);
138  if (maxlevel == tesseract::RIL_SYMBOL) {
139  VerifyRebuild(symbol_limit, maxlevel, it);
140  }
141  }
142 
143  void VerifyAllText(const std::string &truth, ResultIterator *it) {
149  }
150 
151  // Verifies that ResultIterator::CalculateTextlineOrder() produces the right
152  // results given an array of word directions (word_dirs[num_words]), an
153  // expected output reading order
154  // (expected_reading_order[num_reading_order_entries]) and a given reading
155  // context (ltr or rtl).
156  void ExpectTextlineReadingOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,
157  int num_words, int *expected_reading_order,
158  int num_reading_order_entries) const {
159  std::vector<StrongScriptDirection> gv_word_dirs;
160  for (int i = 0; i < num_words; i++) {
161  gv_word_dirs.push_back(word_dirs[i]);
162  }
163 
164  std::vector<int> calculated_order;
165  ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &calculated_order);
166  // STL vector can be used with EXPECT_EQ, so convert...
167  std::vector<int> correct_order(expected_reading_order,
168  expected_reading_order + num_reading_order_entries);
169  EXPECT_EQ(correct_order, calculated_order);
170  }
171 
172  // Verify that ResultIterator::CalculateTextlineOrder() produces sane output
173  // for a given array of word_dirs[num_words] in ltr or rtl context.
174  // Sane means that the output contains some permutation of the indices
175  // 0..[num_words - 1] interspersed optionally with negative (marker) values.
176  void VerifySaneTextlineOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,
177  int num_words) const {
178  std::vector<StrongScriptDirection> gv_word_dirs;
179  for (int i = 0; i < num_words; i++) {
180  gv_word_dirs.push_back(word_dirs[i]);
181  }
182 
183  std::vector<int> output;
184  ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output);
185  ASSERT_GE(output.size(), num_words);
186  std::vector<int> output_copy(output);
187  std::sort(output_copy.begin(), output_copy.end());
188  bool sane = true;
189  unsigned j = 0;
190  while (j < output_copy.size() && output_copy[j] < 0) {
191  j++;
192  }
193  for (int i = 0; i < num_words; i++, j++) {
194  if (output_copy[j] != i) {
195  sane = false;
196  break;
197  }
198  }
199  if (j != output_copy.size()) {
200  sane = false;
201  }
202  if (!sane) {
203  std::vector<int> empty;
204  EXPECT_EQ(output, empty) << " permutation of 0.." << num_words - 1 << " not found in "
205  << (in_ltr_context ? "ltr" : "rtl") << " context.";
206  }
207  }
208 
209  // Objects declared here can be used by all tests that use this fixture.
210  Image src_pix_; // Borrowed from api_. Do not destroy.
211  std::string ocr_text_;
213 };
214 
215 // Tests layout analysis output (and scrollview) on the UNLV page numbered
216 // 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true.
217 //
218 // TEST_F(ResultIteratorTest, VisualTest) {
219 // if (!FLAGS_visual_test) return;
220 // const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr};
221 // for (int i = 0; kIms[i] != nullptr; ++i) {
222 // SetImage(kIms[i]);
223 // // Just run layout analysis.
224 // PageIterator* it = api_.AnalyseLayout();
225 // EXPECT_FALSE(it == nullptr);
226 // // Make a scrollview window for the display.
227 // int width = pixGetWidth(src_pix_);
228 // int height = pixGetHeight(src_pix_);
229 // ScrollView* win =
230 // new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height);
231 // win->Image(src_pix_, 0, 0);
232 // it->Begin();
233 // ScrollView::Color color = ScrollView::RED;
234 // win->Brush(ScrollView::NONE);
235 // do {
236 // Pta* pts = it->BlockPolygon();
237 // if (pts != nullptr) {
238 // win->Pen(color);
239 // int num_pts = ptaGetCount(pts);
240 // l_float32 x, y;
241 // ptaGetPt(pts, num_pts - 1, &x, &y);
242 // win->SetCursor(static_cast<int>(x), static_cast<int>(y));
243 // for (int p = 0; p < num_pts; ++p) {
244 // ptaGetPt(pts, p, &x, &y);
245 // win->DrawTo(static_cast<int>(x), static_cast<int>(y));
246 // }
247 // }
248 // ptaDestroy(&pts);
249 // } while (it->Next(tesseract::RIL_BLOCK));
250 // win->Update();
251 // delete win->AwaitEvent(SVET_DESTROY);
252 // delete win;
253 // delete it;
254 // }
255 //}
256 
257 // Tests that Tesseract gets exactly the right answer on phototest.
259  SetImage("phototest.tif");
260  // Just run layout analysis.
261  PageIterator *p_it = api_.AnalyseLayout();
262  EXPECT_FALSE(p_it == nullptr);
263  // Check iterator position.
264  EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
265  // This should be a single block.
266  EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK));
267  EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
268 
269  // The images should rebuild almost perfectly.
270  LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)"
271  << "\n";
272  VerifyRebuilds(10, 10, 0, 0, 0, p_it);
273  delete p_it;
274 
275  char *result = api_.GetUTF8Text();
276  ocr_text_ = result;
277  delete[] result;
278  ResultIterator *r_it = api_.GetIterator();
279  // The images should rebuild almost perfectly.
280  LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)"
281  << "\n";
282  VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);
283  // Test the text.
284  LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)"
285  << "\n";
286  VerifyAllText(ocr_text_, r_it);
287 
288  // The images should rebuild almost perfectly.
289  LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)"
290  << "\n";
291  VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);
292 
293  r_it->Begin();
294  // Test baseline of the first line.
295  int x1, y1, x2, y2;
296  r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
297  LOG(INFO) << "Baseline ("
298  << x1 << ',' << y1 << ")->(" << x2 << ',' << y2 << ")\n";
299  // Make sure we have a decent vector.
300  EXPECT_GE(x2, x1 + 400);
301  // The point 200,116 should be very close to the baseline.
302  // (x3,y3) is the vector from (x1,y1) to (200,116)
303  int x3 = 200 - x1;
304  int y3 = 116 - y1;
305  x2 -= x1;
306  y2 -= y1;
307  // The cross product (x2,y2)x(x3,y3) should be small.
308  int product = x2 * y3 - x3 * y2;
309  EXPECT_LE(abs(product), x2);
310 
311  // Test font attributes for each word.
312  do {
313  float confidence = r_it->Confidence(tesseract::RIL_WORD);
314 #ifndef DISABLED_LEGACY_ENGINE
315  int pointsize, font_id;
316  bool bold, italic, underlined, monospace, serif, smallcaps;
317  const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
318  &smallcaps, &pointsize, &font_id);
319  EXPECT_GE(confidence, 80.0f);
320 #endif
321  char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
322 
323 #ifdef DISABLED_LEGACY_ENGINE
324  LOG(INFO) << "Word " << word_str << ", conf " << confidence << "\n";
325 #else
326  LOG(INFO) << "Word " << word_str << " in font " << font
327  << ", id " << font_id << ", size " << pointsize
328  << ", conf " << confidence << "\n";
329 #endif // def DISABLED_LEGACY_ENGINE
330  delete[] word_str;
331 #ifndef DISABLED_LEGACY_ENGINE
332  EXPECT_FALSE(bold);
333  EXPECT_FALSE(italic);
334  EXPECT_FALSE(underlined);
335  EXPECT_FALSE(monospace);
336  EXPECT_FALSE(serif);
337  // The text is about 31 pixels high. Above we say the source is 200 ppi,
338  // which translates to:
339  // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts
340  EXPECT_GE(pointsize, 11.16 - 1.50);
341  EXPECT_LE(pointsize, 11.16 + 1.50);
342 #endif // def DISABLED_LEGACY_ENGINE
343  } while (r_it->Next(tesseract::RIL_WORD));
344  delete r_it;
345 }
346 
347 // Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
348 TEST_F(ResultIteratorTest, ComplexTest) {
349  SetImage("8087_054.3B.tif");
350  // Just run layout analysis.
351  PageIterator *it = api_.AnalyseLayout();
352  EXPECT_FALSE(it == nullptr);
353  // The images should rebuild almost perfectly.
354  VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it);
355  delete it;
356 }
357 
358 // Tests image rebuild on the UNLV page numbered 8087_054.3G.tif. (Dubrovnik)
360  SetImage("8087_054.3G.tif");
361  // Just run layout analysis.
362  PageIterator *it = api_.AnalyseLayout();
363  EXPECT_FALSE(it == nullptr);
364  // The images should rebuild almost perfectly.
365  VerifyRebuilds(600, 600, 600, 600, 600, it);
366  delete it;
367 }
368 
369 // Tests that Tesseract gets smallcaps and dropcaps.
370 TEST_F(ResultIteratorTest, SmallCapDropCapTest) {
371 #ifdef DISABLED_LEGACY_ENGINE
372  // Skip test as LSTM mode does not recognize smallcaps & dropcaps attributes.
373  GTEST_SKIP();
374 #else
375  SetImage("8071_093.3B.tif");
376  char *result = api_.GetUTF8Text();
377  delete[] result;
378  ResultIterator *r_it = api_.GetIterator();
379  // Iterate over the words.
380  int found_dropcaps = 0;
381  int found_smallcaps = 0;
382  int false_positives = 0;
383  do {
384  bool bold, italic, underlined, monospace, serif, smallcaps;
385  int pointsize, font_id;
386  r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
387  &pointsize, &font_id);
388  char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
389  if (word_str != nullptr) {
390  LOG(INFO) << "Word " << word_str
391  << " is " << (smallcaps ? "SMALLCAPS" : "Normal") << "\n";
392  if (r_it->SymbolIsDropcap()) {
393  ++found_dropcaps;
394  }
395  if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 ||
396  strcmp(word_str, "RALPH") == 0 || strcmp(word_str, "KINNEY") == 0 || // Not working yet.
397  strcmp(word_str, "BENNETT") == 0) {
398  EXPECT_TRUE(smallcaps) << word_str;
399  ++found_smallcaps;
400  } else {
401  if (smallcaps) {
402  ++false_positives;
403  }
404  }
405  // No symbol other than the first of any word should be dropcap.
406  ResultIterator s_it(*r_it);
408  if (s_it.SymbolIsDropcap()) {
409  char *sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
410  LOG(ERROR) << "Symbol " << sym_str << " of word " << word_str << " is dropcap";
411  delete[] sym_str;
412  }
413  EXPECT_FALSE(s_it.SymbolIsDropcap());
414  }
415  delete[] word_str;
416  }
417  } while (r_it->Next(tesseract::RIL_WORD));
418  delete r_it;
419  EXPECT_EQ(1, found_dropcaps);
420  EXPECT_GE(4, found_smallcaps);
421  EXPECT_LE(false_positives, 3);
422 #endif // DISABLED_LEGACY_ENGINE
423 }
424 
425 #if 0
426 // TODO(rays) uncomment on the next change to layout analysis.
427 // CL 22736106 breaks it, but it is fixed in the change when
428 // the textline finders start to collapse.
429 
430 // Tests that Tesseract gets subscript and superscript.
431 // TODO(rays) This test is a bit feeble, due to bad textline finding on this
432 // image, so beef up the test a bit when we get less false positive subs.
433 TEST_F(ResultIteratorTest, SubSuperTest) {
434  SetImage("0146_281.3B.tif");
435  char* result = api_.GetUTF8Text();
436  delete [] result;
437  ResultIterator* r_it = api_.GetIterator();
438  // Iterate over the symbols.
439  // Accuracy isn't great, so just count up and expect a decent count of
440  // positives and negatives.
441  const char kAllowedSupers[] = "O0123456789-";
442  int found_subs = 0;
443  int found_supers = 0;
444  int found_normal = 0;
445  do {
446  if (r_it->SymbolIsSubscript()) {
447  ++found_subs;
448  } else if (r_it->SymbolIsSuperscript()) {
449  result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL);
450  if (strchr(kAllowedSupers, result[0]) == nullptr) {
451  char* word = r_it->GetUTF8Text(tesseract::RIL_WORD);
452  LOG(ERROR) << "Char " << result << " in word " << word << " is unexpected super!";
453  delete [] word;
454  EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr);
455  }
456  delete [] result;
457  ++found_supers;
458  } else {
459  ++found_normal;
460  }
461  } while (r_it->Next(tesseract::RIL_SYMBOL));
462  delete r_it;
463  LOG(INFO) << "Subs = " << found_subs << ", supers= " << found_supers
464  << ", normal = " << found_normal << "\n";
465  EXPECT_GE(found_subs, 25);
466  EXPECT_GE(found_supers, 25);
467  EXPECT_GE(found_normal, 1350);
468 }
469 #endif
470 
471 static const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT;
472 static const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT;
473 static const StrongScriptDirection dN = DIR_NEUTRAL;
474 
475 // Test that a sequence of words that could be interpreted to start from
476 // the left side left-to-right or from the right side right-to-left is
477 // interpreted appropriately in different contexts.
478 TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) {
479  const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR};
480  int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart,
481  0, 1, 2, 3, ResultIterator::kMinorRunEnd};
482  int reading_order_ltr_context[] = {
484 
485  ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,
486  countof(reading_order_ltr_context));
487  ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
488  countof(reading_order_rtl_context));
489 }
490 
491 // Tests that clearly left-direction text (with no right-to-left indications)
492 // comes out strictly left to right no matter the context.
493 TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) {
494  const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL};
495  // The order here is just left to right, nothing fancy.
496  int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7};
497  // In the strange event that this shows up in an RTL paragraph, nonetheless
498  // just presume the whole thing is an LTR line.
499  int reading_order_rtl_context[] = {ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,
501 
502  ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,
503  countof(reading_order_ltr_context));
504  ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
505  countof(reading_order_rtl_context));
506 }
507 
508 // Test that right-direction text comes out strictly right-to-left in
509 // a right-to-left context.
510 TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) {
511  const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR};
512  // The order here is just right-to-left, nothing fancy.
513  int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0};
514  ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
515  countof(reading_order_rtl_context));
516 }
517 
518 TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) {
519  // Iterate through all 7-word sequences and make sure that the output
520  // contains each of the indices 0..6 exactly once.
521  const int kNumWords(7);
522  const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations
523  StrongScriptDirection word_dirs[kNumWords];
524  for (int i = 0; i < kNumCombos; i++) {
525  // generate the next combination.
526  int tmp = i;
527  for (auto &word_dir : word_dirs) {
528  word_dir = static_cast<StrongScriptDirection>(tmp % 4);
529  tmp = tmp / 4;
530  }
531  VerifySaneTextlineOrder(true, word_dirs, kNumWords);
532  VerifySaneTextlineOrder(false, word_dirs, kNumWords);
533  }
534 }
535 
536 // TODO: Missing image
537 TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) {
538  SetImage("5318c4b679264.jpg");
539  char *result = api_.GetUTF8Text();
540  delete[] result;
541  ResultIterator *r_it = api_.GetIterator();
542  // Iterate over the words.
543  do {
544  char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
545  if (word_str != nullptr) {
546  LOG(INFO) << "Word " << word_str << ":\n";
547  ResultIterator s_it = *r_it;
548  do {
549  tesseract::ChoiceIterator c_it(s_it);
550  do {
551  const char *char_str = c_it.GetUTF8Text();
552  if (char_str == nullptr) {
553  LOG(INFO) << "Null char choice"
554  << "\n";
555  } else {
556  LOG(INFO) << "Char choice " << char_str << "\n";
557  }
558  CHECK(char_str != nullptr);
559  } while (c_it.Next());
562  delete[] word_str;
563  }
564  } while (r_it->Next(tesseract::RIL_WORD));
565  delete r_it;
566 }
567 
568 // TODO: Missing image
569 TEST_F(ResultIteratorTest, NonNullConfidencesTest) {
570  // SetImage("line6.tiff");
571  SetImage("trainingitalline.tif");
572  api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
573  // Force recognition so we can use the result iterator.
574  // We don't care about the return from GetUTF8Text.
575  char *result = api_.GetUTF8Text();
576  delete[] result;
577  ResultIterator *r_it = api_.GetIterator();
578  // Iterate over the words.
579  do {
580  char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
581  if (word_str != nullptr) {
582  EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD));
583  EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL));
584  ResultIterator s_it = *r_it;
585  do {
586  const char *char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
587  CHECK(char_str != nullptr);
588  float confidence = s_it.Confidence(tesseract::RIL_SYMBOL);
589  LOG(INFO) << "Char " << char_str << " has confidence " << confidence << "\n";
590  delete[] char_str;
593  delete[] word_str;
594  } else {
595  LOG(INFO) << "Empty word found"
596  << "\n";
597  }
598  } while (r_it->Next(tesseract::RIL_WORD));
599  delete r_it;
600 }
601 
602 } // namespace tesseract
@ LOG
#define CHECK(condition)
Definition: include_gunit.h:76
@ ERROR
Definition: log.h:28
@ INFO
Definition: log.h:28
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
@ PSM_AUTO
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:164
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
StrongScriptDirection
Definition: unichar.h:43
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
@ DIR_NEUTRAL
Definition: unichar.h:44
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:42
TEST_F(EuroText, FastLatinOCR)
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:508
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:365
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:573
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
float Confidence(PageIteratorLevel level) const
const char * GetUTF8Text() const
virtual bool Next(PageIteratorLevel level)
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
Pix * GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
bool Empty(PageIteratorLevel level) const
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
Pix * GetBinaryImage(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
static const int kMinorRunEnd
static const int kMinorRunStart
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override
void destroy()
Definition: image.cpp:32
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
void VerifyAllText(const std::string &truth, ResultIterator *it)
std::string OutputNameToPath(const std::string &name)
void VerifyIteratorText(const std::string &truth, PageIteratorLevel level, ResultIterator *it)
std::string TestDataNameToPath(const std::string &name)
void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator *it)
void ExpectTextlineReadingOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs, int num_words, int *expected_reading_order, int num_reading_order_entries) const
void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit, int symbol_limit, PageIterator *it, PageIteratorLevel maxlevel=tesseract::RIL_SYMBOL)
void VerifySaneTextlineOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs, int num_words) const
~ResultIteratorTest() override=default
void SetImage(const char *filename)