tesseract  5.0.0
tesseract::PAGE_RES_IT Class Reference

#include <pageres.h>

Public Member Functions

 PAGE_RES_IT ()=default
 
 PAGE_RES_IT (PAGE_RES *the_page_res)
 
bool operator== (const PAGE_RES_IT &other) const
 
bool operator!= (const PAGE_RES_IT &other) const
 
int cmp (const PAGE_RES_IT &other) const
 
WERD_RES * restart_page ()
 
WERD_RES * restart_page_with_empties ()
 
WERD_RES * start_page (bool empty_ok)
 
WERD_RES * restart_row ()
 
WERD_RES * InsertSimpleCloneWord (const WERD_RES &clone_res, WERD *new_word)
 
void ReplaceCurrentWord (PointerVector< WERD_RES > *words)
 
void DeleteCurrentWord ()
 
void MakeCurrentWordFuzzy ()
 
WERD_RES * forward ()
 
WERD_RES * forward_with_empties ()
 
WERD_RES * forward_paragraph ()
 
WERD_RES * forward_block ()
 
WERD_RES * prev_word () const
 
ROW_RES * prev_row () const
 
BLOCK_RES * prev_block () const
 
WERD_RES * word () const
 
ROW_RES * row () const
 
BLOCK_RES * block () const
 
WERD_RES * next_word () const
 
ROW_RES * next_row () const
 
BLOCK_RES * next_block () const
 
void rej_stat_word ()
 
void ResetWordIterator ()
 

Public Attributes

PAGE_RES * page_res
 

Detailed Description

Definition at line 682 of file pageres.h.

Constructor & Destructor Documentation

◆ PAGE_RES_IT() [1/2]

tesseract::PAGE_RES_IT::PAGE_RES_IT ( )
default

◆ PAGE_RES_IT() [2/2]

tesseract::PAGE_RES_IT::PAGE_RES_IT ( PAGE_RES * the_page_res)
inline

Definition at line 688 of file pageres.h.

688  { // page result
689  page_res = the_page_res;
690  restart_page(); // ready to scan
691  }
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * restart_page()
Definition: pageres.h:710

Member Function Documentation

◆ block()

BLOCK_RES* tesseract::PAGE_RES_IT::block ( ) const
inline

Definition at line 769 of file pageres.h.

769  { // block of cur. word
770  return block_res;
771  }

◆ cmp()

int tesseract::PAGE_RES_IT::cmp ( const PAGE_RES_IT & other) const

Definition at line 1183 of file pageres.cpp.

1183  {
1184  ASSERT_HOST(page_res == other.page_res);
1185  if (other.block_res == nullptr) {
1186  // other points to the end of the page.
1187  if (block_res == nullptr) {
1188  return 0;
1189  }
1190  return -1;
1191  }
1192  if (block_res == nullptr) {
1193  return 1; // we point to the end of the page.
1194  }
1195  if (block_res == other.block_res) {
1196  if (other.row_res == nullptr || row_res == nullptr) {
1197  // this should only happen if we hit an image block.
1198  return 0;
1199  }
1200  if (row_res == other.row_res) {
1201  // we point to the same block and row.
1202  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1203  if (word_res == other.word_res) {
1204  // we point to the same word!
1205  return 0;
1206  }
1207 
1208  WERD_RES_IT word_res_it(&row_res->word_res_list);
1209  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1210  word_res_it.forward()) {
1211  if (word_res_it.data() == word_res) {
1212  return -1;
1213  } else if (word_res_it.data() == other.word_res) {
1214  return 1;
1215  }
1216  }
1217  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1218  }
1219 
1220  // we both point to the same block, but different rows.
1221  ROW_RES_IT row_res_it(&block_res->row_res_list);
1222  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1223  row_res_it.forward()) {
1224  if (row_res_it.data() == row_res) {
1225  return -1;
1226  } else if (row_res_it.data() == other.row_res) {
1227  return 1;
1228  }
1229  }
1230  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1231  }
1232 
1233  // We point to different blocks.
1234  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1235  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
1236  block_res_it.forward()) {
1237  if (block_res_it.data() == block_res) {
1238  return -1;
1239  } else if (block_res_it.data() == other.block_res) {
1240  return 1;
1241  }
1242  }
1243  // Shouldn't happen...
1244  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1245  return 0;
1246 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
ROW_RES_LIST row_res_list
Definition: pageres.h:129
WERD_RES_LIST word_res_list
Definition: pageres.h:148

◆ DeleteCurrentWord()

void tesseract::PAGE_RES_IT::DeleteCurrentWord ( )

Definition at line 1488 of file pageres.cpp.

1488  {
1489  // Check that this word is as we expect. part_of_combos are NEVER iterated
1490  // by the normal iterator, so we should never be trying to delete them.
1491  ASSERT_HOST(!word_res->part_of_combo);
1492  if (!word_res->combination) {
1493  // Combinations own their own word, so we won't find the word on the
1494  // row's word_list, but it is legitimate to try to delete them.
1495  // Delete word from the ROW when not a combination.
1496  WERD_IT w_it(row()->row->word_list());
1497  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1498  if (w_it.data() == word_res->word) {
1499  break;
1500  }
1501  }
1502  ASSERT_HOST(!w_it.cycled_list());
1503  delete w_it.extract();
1504  }
1505  // Remove the WERD_RES for the new_word.
1506  // Remove the WORD_RES from the ROW_RES.
1507  WERD_RES_IT wr_it(&row()->word_res_list);
1508  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1509  if (wr_it.data() == word_res) {
1510  word_res = nullptr;
1511  break;
1512  }
1513  }
1514  ASSERT_HOST(!wr_it.cycled_list());
1515  delete wr_it.extract();
1516  ResetWordIterator();
1517 }
ROW_RES * row() const
Definition: pageres.h:766

◆ forward()

WERD_RES* tesseract::PAGE_RES_IT::forward ( )
inline

Definition at line 743 of file pageres.h.

743  { // Get next word.
744  return internal_forward(false, false);
745  }

◆ forward_block()

WERD_RES * tesseract::PAGE_RES_IT::forward_block ( )

Definition at line 1715 of file pageres.cpp.

1715  {
1716  while (block_res == next_block_res) {
1717  internal_forward(false, true);
1718  }
1719  return internal_forward(false, true);
1720 }

◆ forward_paragraph()

WERD_RES * tesseract::PAGE_RES_IT::forward_paragraph ( )

Definition at line 1700 of file pageres.cpp.

1700  {
1701  while (block_res == next_block_res &&
1702  (next_row_res != nullptr && next_row_res->row != nullptr &&
1703  row_res->row->para() == next_row_res->row->para())) {
1704  internal_forward(false, true);
1705  }
1706  return internal_forward(false, true);
1707 }
PARA * para() const
Definition: ocrrow.h:120

◆ forward_with_empties()

WERD_RES* tesseract::PAGE_RES_IT::forward_with_empties ( )
inline

Definition at line 747 of file pageres.h.

747  {
748  return internal_forward(false, true);
749  }

◆ InsertSimpleCloneWord()

WERD_RES * tesseract::PAGE_RES_IT::InsertSimpleCloneWord ( const WERD_RES & clone_res,
WERD * new_word 
)

Definition at line 1252 of file pageres.cpp.

1253  {
1254  // Make a WERD_RES for the new_word.
1255  auto *new_res = new WERD_RES(new_word);
1256  new_res->CopySimpleFields(clone_res);
1257  new_res->combination = true;
1258  // Insert into the appropriate place in the ROW_RES.
1259  WERD_RES_IT wr_it(&row()->word_res_list);
1260  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1261  WERD_RES *word = wr_it.data();
1262  if (word == word_res) {
1263  break;
1264  }
1265  }
1266  ASSERT_HOST(!wr_it.cycled_list());
1267  wr_it.add_before_then_move(new_res);
1268  if (wr_it.at_first()) {
1269  // This is the new first word, so reset the member iterator so it
1270  // detects the cycled_list state correctly.
1271  ResetWordIterator();
1272  }
1273  return new_res;
1274 }
WERD_RES * word() const
Definition: pageres.h:763

◆ MakeCurrentWordFuzzy()

void tesseract::PAGE_RES_IT::MakeCurrentWordFuzzy ( )

Definition at line 1521 of file pageres.cpp.

1521  {
1522  WERD *real_word = word_res->word;
1523  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1524  real_word->set_flag(W_FUZZY_SP, true);
1525  if (word_res->combination) {
1526  // The next word should be the corresponding part of combo, but we have
1527  // already stepped past it, so find it by search.
1528  WERD_RES_IT wr_it(&row()->word_res_list);
1529  for (wr_it.mark_cycle_pt();
1530  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1531  }
1532  wr_it.forward();
1533  ASSERT_HOST(wr_it.data()->part_of_combo);
1534  real_word = wr_it.data()->word;
1535  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1536  !real_word->flag(W_FUZZY_NON));
1537  real_word->set_flag(W_FUZZY_SP, true);
1538  }
1539  }
1540 }
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131

◆ next_block()

BLOCK_RES* tesseract::PAGE_RES_IT::next_block ( ) const
inline

Definition at line 778 of file pageres.h.

778  { // block of next word
779  return next_block_res;
780  }

◆ next_row()

ROW_RES* tesseract::PAGE_RES_IT::next_row ( ) const
inline

Definition at line 775 of file pageres.h.

775  { // row of next word
776  return next_row_res;
777  }

◆ next_word()

WERD_RES* tesseract::PAGE_RES_IT::next_word ( ) const
inline

Definition at line 772 of file pageres.h.

772  { // next word
773  return next_word_res;
774  }

◆ operator!=()

bool tesseract::PAGE_RES_IT::operator!= ( const PAGE_RES_IT & other) const
inline

Definition at line 700 of file pageres.h.

700  {
701  return !(*this == other);
702  }

◆ operator==()

bool tesseract::PAGE_RES_IT::operator== ( const PAGE_RES_IT & other) const
inline

Definition at line 695 of file pageres.h.

695  {
696  return word_res == other.word_res && row_res == other.row_res &&
697  block_res == other.block_res;
698  }

◆ prev_block()

BLOCK_RES* tesseract::PAGE_RES_IT::prev_block ( ) const
inline

Definition at line 760 of file pageres.h.

760  { // block of prev word
761  return prev_block_res;
762  }

◆ prev_row()

ROW_RES* tesseract::PAGE_RES_IT::prev_row ( ) const
inline

Definition at line 757 of file pageres.h.

757  { // row of prev word
758  return prev_row_res;
759  }

◆ prev_word()

WERD_RES* tesseract::PAGE_RES_IT::prev_word ( ) const
inline

Definition at line 754 of file pageres.h.

754  { // previous word
755  return prev_word_res;
756  }

◆ rej_stat_word()

void tesseract::PAGE_RES_IT::rej_stat_word ( )

Definition at line 1722 of file pageres.cpp.

1722  {
1723  int16_t chars_in_word;
1724  int16_t rejects_in_word = 0;
1725 
1726  chars_in_word = word_res->reject_map.length();
1727  page_res->char_count += chars_in_word;
1728  block_res->char_count += chars_in_word;
1729  row_res->char_count += chars_in_word;
1730 
1731  rejects_in_word = word_res->reject_map.reject_count();
1732 
1733  page_res->rej_count += rejects_in_word;
1734  block_res->rej_count += rejects_in_word;
1735  row_res->rej_count += rejects_in_word;
1736  if (chars_in_word == rejects_in_word) {
1737  row_res->whole_word_rej_count += rejects_in_word;
1738  }
1739 }
int32_t rej_count
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:121
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
int16_t reject_count() const
Definition: rejctmap.h:339
uint16_t length() const
Definition: rejctmap.h:333

◆ ReplaceCurrentWord()

void tesseract::PAGE_RES_IT::ReplaceCurrentWord ( tesseract::PointerVector< WERD_RES > *  words)

Definition at line 1378 of file pageres.cpp.

1379  {
1380  if (words->empty()) {
1381  DeleteCurrentWord();
1382  return;
1383  }
1384  WERD_RES *input_word = word();
1385  // Set the BOL/EOL flags on the words from the input word.
1386  if (input_word->word->flag(W_BOL)) {
1387  (*words)[0]->word->set_flag(W_BOL, true);
1388  } else {
1389  (*words)[0]->word->set_blanks(input_word->word->space());
1390  }
1391  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1392 
1393  // Move the blobs from the input word to the new set of words.
1394  // If the input word_res is a combination, then the replacements will also be
1395  // combinations, and will own their own words. If the input word_res is not a
1396  // combination, then the final replacements will not be either, (although it
1397  // is allowed for the input words to be combinations) and their words
1398  // will get put on the row list. This maintains the ownership rules.
1399  WERD_IT w_it(row()->row->word_list());
1400  if (!input_word->combination) {
1401  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1402  WERD *word = w_it.data();
1403  if (word == input_word->word) {
1404  break;
1405  }
1406  }
1407  // w_it is now set to the input_word's word.
1408  ASSERT_HOST(!w_it.cycled_list());
1409  }
1410  // Insert into the appropriate place in the ROW_RES.
1411  WERD_RES_IT wr_it(&row()->word_res_list);
1412  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1413  WERD_RES *word = wr_it.data();
1414  if (word == input_word) {
1415  break;
1416  }
1417  }
1418  ASSERT_HOST(!wr_it.cycled_list());
1419  // Since we only have an estimate of the bounds between blobs, use the blob
1420  // x-middle as the determiner of where to put the blobs
1421  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1422  src_b_it.sort(&C_BLOB::SortByXMiddle);
1423  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1424  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1425  TBOX clip_box;
1426  for (size_t w = 0; w < words->size(); ++w) {
1427  WERD_RES *word_w = (*words)[w];
1428  clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1429  // Compute blob boundaries.
1430  std::vector<int> blob_ends;
1431  C_BLOB_LIST *next_word_blobs =
1432  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1433  ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1434  // Remove the fake blobs on the current word, but keep safe for back-up if
1435  // no blob can be found.
1436  C_BLOB_LIST fake_blobs;
1437  C_BLOB_IT fake_b_it(&fake_blobs);
1438  fake_b_it.add_list_after(word_w->word->cblob_list());
1439  fake_b_it.move_to_first();
1440  word_w->word->cblob_list()->clear();
1441  C_BLOB_IT dest_it(word_w->word->cblob_list());
1442  // Build the box word as we move the blobs.
1443  auto *box_word = new tesseract::BoxWord;
1444  for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1445  int end_x = blob_ends[i];
1446  TBOX blob_box;
1447  // Add the blobs up to end_x.
1448  while (!src_b_it.empty() &&
1449  src_b_it.data()->bounding_box().x_middle() < end_x) {
1450  blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1451  src_b_it.forward();
1452  }
1453  while (!rej_b_it.empty() &&
1454  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455  blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1456  rej_b_it.forward();
1457  }
1458  if (blob_box.null_box()) {
1459  // Use the original box as a back-up.
1460  blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1461  }
1462  box_word->InsertBox(i, blob_box);
1463  }
1464  delete word_w->box_word;
1465  word_w->box_word = box_word;
1466  if (!input_word->combination) {
1467  // Insert word_w->word into the ROW. It doesn't own its word, so the
1468  // ROW needs to own it.
1469  w_it.add_before_stay_put(word_w->word);
1470  word_w->combination = false;
1471  }
1472  (*words)[w] = nullptr; // We are taking ownership.
1473  wr_it.add_before_stay_put(word_w);
1474  }
1475  // We have taken ownership of the words.
1476  words->clear();
1477  // Delete the current word, which has been replaced. We could just call
1478  // DeleteCurrentWord, but that would iterate both lists again, and we know
1479  // we are already in the right place.
1480  if (!input_word->combination) {
1481  delete w_it.extract();
1482  }
1483  delete wr_it.extract();
1484  ResetWordIterator();
1485 }
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
unsigned size() const
Definition: genericvector.h:74

◆ ResetWordIterator()

void tesseract::PAGE_RES_IT::ResetWordIterator ( )

Definition at line 1571 of file pageres.cpp.

1571  {
1572  if (row_res == next_row_res) {
1573  // Reset the member iterator so it can move forward and detect the
1574  // cycled_list state correctly.
1575  word_res_it.move_to_first();
1576  for (word_res_it.mark_cycle_pt();
1577  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1578  word_res_it.forward()) {
1579  if (!word_res_it.data()->part_of_combo) {
1580  if (prev_row_res == row_res) {
1581  prev_word_res = word_res;
1582  }
1583  word_res = word_res_it.data();
1584  }
1585  }
1586  ASSERT_HOST(!word_res_it.cycled_list());
1587  wr_it_of_next_word = word_res_it;
1588  word_res_it.forward();
1589  } else {
1590  // word_res_it is OK, but reset word_res and prev_word_res if needed.
1591  WERD_RES_IT wr_it(&row_res->word_res_list);
1592  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1593  if (!wr_it.data()->part_of_combo) {
1594  if (prev_row_res == row_res) {
1595  prev_word_res = word_res;
1596  }
1597  word_res = wr_it.data();
1598  }
1599  }
1600  }
1601 }

◆ restart_page()

WERD_RES* tesseract::PAGE_RES_IT::restart_page ( )
inline

Definition at line 710 of file pageres.h.

710  {
711  return start_page(false); // Skip empty blocks.
712  }
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1548

◆ restart_page_with_empties()

WERD_RES* tesseract::PAGE_RES_IT::restart_page_with_empties ( )
inline

Definition at line 713 of file pageres.h.

713  {
714  return start_page(true); // Allow empty blocks.
715  }

◆ restart_row()

WERD_RES * tesseract::PAGE_RES_IT::restart_row ( )

Definition at line 1683 of file pageres.cpp.

1683  {
1684  ROW_RES *row = this->row();
1685  if (!row) {
1686  return nullptr;
1687  }
1688  for (restart_page(); this->row() != row; forward()) {
1689  // pass
1690  }
1691  return word();
1692 }
WERD_RES * forward()
Definition: pageres.h:743

◆ row()

ROW_RES* tesseract::PAGE_RES_IT::row ( ) const
inline

Definition at line 766 of file pageres.h.

766  { // row of current word
767  return row_res;
768  }

◆ start_page()

WERD_RES * tesseract::PAGE_RES_IT::start_page ( bool  empty_ok)

Definition at line 1548 of file pageres.cpp.

1548  {
1549  block_res_it.set_to_list(&page_res->block_res_list);
1550  block_res_it.mark_cycle_pt();
1551  prev_block_res = nullptr;
1552  prev_row_res = nullptr;
1553  prev_word_res = nullptr;
1554  block_res = nullptr;
1555  row_res = nullptr;
1556  word_res = nullptr;
1557  next_block_res = nullptr;
1558  next_row_res = nullptr;
1559  next_word_res = nullptr;
1560  internal_forward(true, empty_ok);
1561  return internal_forward(false, empty_ok);
1562 }

◆ word()

WERD_RES* tesseract::PAGE_RES_IT::word ( ) const
inline

Definition at line 763 of file pageres.h.

763  { // current word
764  return word_res;
765  }

Member Data Documentation

◆ page_res

PAGE_RES* tesseract::PAGE_RES_IT::page_res

Definition at line 684 of file pageres.h.


The documentation for this class was generated from the following files: