tesseract  5.0.0
hocrrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: hocrrenderer.cpp
3  * Description: hOCR output for Tesseract (TessBaseAPI::GetHOCRText and TessHOcrRenderer).
4  * Author: Ray Smith (original code from baseapi.cpp)
5  * Author: Stefan Weil (moved to separate file and cleaned code)
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <tesseract/baseapi.h> // for TessBaseAPI
21 #include <locale> // for std::locale::classic
22 #include <memory> // for std::unique_ptr
23 #include <sstream> // for std::stringstream
24 #ifdef _WIN32
25 # include "host.h" // windows.h for MultiByteToWideChar, ...
26 #endif
27 #include <tesseract/renderer.h>
28 #include "tesseractclass.h" // for Tesseract
29 
30 namespace tesseract {
31 
35 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
36  tesseract::Orientation orientation;
37  tesseract::WritingDirection writing_direction;
38  tesseract::TextlineOrder textline_order;
39  float deskew_angle;
40  it->Orientation(&orientation, &writing_direction, &textline_order,
41  &deskew_angle);
42  return orientation;
43 }
44 
53 static void AddBaselineCoordsTohOCR(const PageIterator *it,
54  PageIteratorLevel level,
55  std::stringstream &hocr_str) {
56  tesseract::Orientation orientation = GetBlockTextOrientation(it);
57  if (orientation != ORIENTATION_PAGE_UP) {
58  hocr_str << "; textangle " << 360 - orientation * 90;
59  return;
60  }
61 
62  int left, top, right, bottom;
63  it->BoundingBox(level, &left, &top, &right, &bottom);
64 
65  // Try to get the baseline coordinates at this level.
66  int x1, y1, x2, y2;
67  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
68  return;
69  }
70  // Following the description of this field of the hOCR spec, we convert the
71  // baseline coordinates so that "the bottom left of the bounding box is the
72  // origin".
73  x1 -= left;
74  x2 -= left;
75  y1 -= bottom;
76  y2 -= bottom;
77 
78  // Now fit a line through the points so we can extract coefficients for the
79  // equation: y = p1 x + p0
80  if (x1 == x2) {
81  // Problem computing the polynomial coefficients.
82  return;
83  }
84  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
85  double p0 = y1 - p1 * x1;
86 
87  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
88  << round(p0 * 1000.0) / 1000.0;
89 }
90 
91 static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
92  std::stringstream &hocr_str) {
93  int left, top, right, bottom;
94  it->BoundingBox(level, &left, &top, &right, &bottom);
95  // This is the only place we use double quotes instead of single quotes,
96  // but it may too late to change for consistency
97  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
98  << bottom;
99  // Add baseline coordinates & heights for textlines only.
100  if (level == RIL_TEXTLINE) {
101  AddBaselineCoordsTohOCR(it, level, hocr_str);
102  // add custom height measures
103  float row_height, descenders, ascenders; // row attributes
104  it->RowAttributes(&row_height, &descenders, &ascenders);
105  // TODO(rays): Do we want to limit these to a single decimal place?
106  hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
107  << "; x_ascenders " << ascenders;
108  }
109  hocr_str << "\">";
110 }
111 
121 char *TessBaseAPI::GetHOCRText(int page_number) {
122  return GetHOCRText(nullptr, page_number);
123 }
124 
134 char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
135  if (tesseract_ == nullptr ||
136  (page_res_ == nullptr && Recognize(monitor) < 0)) {
137  return nullptr;
138  }
139 
140  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
141  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
142  bool para_is_ltr = true; // Default direction is LTR
143  const char *paragraph_lang = nullptr;
144  bool font_info = false;
145  bool hocr_boxes = false;
146  GetBoolVariable("hocr_font_info", &font_info);
147  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
148 
149  if (input_file_.empty()) {
150  SetInputName(nullptr);
151  }
152 
153 #ifdef _WIN32
154  // convert input name from ANSI encoding to utf-8
155  int str16_len =
156  MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
157  wchar_t *uni16_str = new WCHAR[str16_len];
158  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
159  str16_len);
160  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
161  0, nullptr, nullptr);
162  char *utf8_str = new char[utf8_len];
163  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
164  nullptr, nullptr);
165  input_file_ = utf8_str;
166  delete[] uni16_str;
167  delete[] utf8_str;
168 #endif
169 
170  std::stringstream hocr_str;
171  // Use "C" locale (needed for double values x_size and x_descenders).
172  hocr_str.imbue(std::locale::classic());
173  // Use 8 digits for double values.
174  hocr_str.precision(8);
175  hocr_str << " <div class='ocr_page'"
176  << " id='"
177  << "page_" << page_id << "'"
178  << " title='image \"";
179  if (!input_file_.empty()) {
180  hocr_str << HOcrEscape(input_file_.c_str());
181  } else {
182  hocr_str << "unknown";
183  }
184 
185  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
186  << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
187  << "; scan_res " << GetSourceYResolution() << " "
188  << GetSourceYResolution() << "'>\n";
189 
190  std::unique_ptr<ResultIterator> res_it(GetIterator());
191  while (!res_it->Empty(RIL_BLOCK)) {
192  if (res_it->Empty(RIL_WORD)) {
193  res_it->Next(RIL_WORD);
194  continue;
195  }
196 
197  // Open any new block/paragraph/textline.
198  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
199  para_is_ltr = true; // reset to default direction
200  hocr_str << " <div class='ocr_carea'"
201  << " id='"
202  << "block_" << page_id << "_" << bcnt << "'";
203  AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
204  }
205  if (res_it->IsAtBeginningOf(RIL_PARA)) {
206  hocr_str << "\n <p class='ocr_par'";
207  para_is_ltr = res_it->ParagraphIsLtr();
208  if (!para_is_ltr) {
209  hocr_str << " dir='rtl'";
210  }
211  hocr_str << " id='"
212  << "par_" << page_id << "_" << pcnt << "'";
213  paragraph_lang = res_it->WordRecognitionLanguage();
214  if (paragraph_lang) {
215  hocr_str << " lang='" << paragraph_lang << "'";
216  }
217  AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
218  }
219  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
220  hocr_str << "\n <span class='";
221  switch (res_it->BlockType()) {
222  case PT_HEADING_TEXT:
223  hocr_str << "ocr_header";
224  break;
225  case PT_PULLOUT_TEXT:
226  hocr_str << "ocr_textfloat";
227  break;
228  case PT_CAPTION_TEXT:
229  hocr_str << "ocr_caption";
230  break;
231  default:
232  hocr_str << "ocr_line";
233  }
234  hocr_str << "' id='"
235  << "line_" << page_id << "_" << lcnt << "'";
236  AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
237  }
238 
239  // Now, process the word...
240  int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
241  std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
242  *rawTimestepMap = nullptr;
243  std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
244  if (lstm_choice_mode) {
245  CTCMap = res_it->GetBestLSTMSymbolChoices();
246  rawTimestepMap = res_it->GetRawLSTMTimesteps();
247  }
248  hocr_str << "\n <span class='ocrx_word'"
249  << " id='"
250  << "word_" << page_id << "_" << wcnt << "'";
251  int left, top, right, bottom;
252  bool bold, italic, underlined, monospace, serif, smallcaps;
253  int pointsize, font_id;
254  const char *font_name;
255  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
256  font_name =
257  res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
258  &serif, &smallcaps, &pointsize, &font_id);
259  hocr_str << " title='bbox " << left << " " << top << " " << right << " "
260  << bottom << "; x_wconf "
261  << static_cast<int>(res_it->Confidence(RIL_WORD));
262  if (font_info) {
263  if (font_name) {
264  hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
265  }
266  hocr_str << "; x_fsize " << pointsize;
267  }
268  hocr_str << "'";
269  const char *lang = res_it->WordRecognitionLanguage();
270  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
271  hocr_str << " lang='" << lang << "'";
272  }
273  switch (res_it->WordDirection()) {
274  // Only emit direction if different from current paragraph direction
275  case DIR_LEFT_TO_RIGHT:
276  if (!para_is_ltr) {
277  hocr_str << " dir='ltr'";
278  }
279  break;
280  case DIR_RIGHT_TO_LEFT:
281  if (para_is_ltr) {
282  hocr_str << " dir='rtl'";
283  }
284  break;
285  case DIR_MIX:
286  case DIR_NEUTRAL:
287  default: // Do nothing.
288  break;
289  }
290  hocr_str << ">";
291  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
292  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
293  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
294  if (bold) {
295  hocr_str << "<strong>";
296  }
297  if (italic) {
298  hocr_str << "<em>";
299  }
300  do {
301  const std::unique_ptr<const char[]> grapheme(
302  res_it->GetUTF8Text(RIL_SYMBOL));
303  if (grapheme && grapheme[0] != 0) {
304  if (hocr_boxes) {
305  res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
306  hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
307  << left << " " << top << " " << right << " " << bottom
308  << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
309  }
310  hocr_str << HOcrEscape(grapheme.get()).c_str();
311  if (hocr_boxes) {
312  hocr_str << "</span>";
313  tesseract::ChoiceIterator ci(*res_it);
314  if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
315  std::vector<std::vector<std::pair<const char *, float>>> *symbol =
316  ci.Timesteps();
317  hocr_str << "\n <span class='ocr_symbol'"
318  << " id='"
319  << "symbol_" << page_id << "_" << wcnt << "_" << scnt
320  << "'>";
321  for (const auto &timestep : *symbol) {
322  hocr_str << "\n <span class='ocrx_cinfo'"
323  << " id='"
324  << "timestep" << page_id << "_" << wcnt << "_" << tcnt
325  << "'>";
326  for (auto conf : timestep) {
327  hocr_str << "\n <span class='ocrx_cinfo'"
328  << " id='"
329  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
330  << "'"
331  << " title='x_confs " << int(conf.second * 100) << "'>"
332  << HOcrEscape(conf.first).c_str() << "</span>";
333  ++ccnt;
334  }
335  hocr_str << "</span>";
336  ++tcnt;
337  }
338  hocr_str << "\n </span>";
339  ++scnt;
340  } else if (lstm_choice_mode == 2) {
341  hocr_str << "\n <span class='ocrx_cinfo'"
342  << " id='"
343  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
344  << "'>";
345  do {
346  const char *choice = ci.GetUTF8Text();
347  float choiceconf = ci.Confidence();
348  if (choice != nullptr) {
349  hocr_str << "\n <span class='ocrx_cinfo'"
350  << " id='"
351  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
352  << "'"
353  << " title='x_confs " << choiceconf << "'>"
354  << HOcrEscape(choice).c_str() << "</span>";
355  ccnt++;
356  }
357  } while (ci.Next());
358  hocr_str << "\n </span>";
359  tcnt++;
360  }
361  }
362  }
363  res_it->Next(RIL_SYMBOL);
364  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
365  if (italic) {
366  hocr_str << "</em>";
367  }
368  if (bold) {
369  hocr_str << "</strong>";
370  }
371  // If the lstm choice mode is required it is added here
372  if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
373  for (const auto &symbol : *rawTimestepMap) {
374  hocr_str << "\n <span class='ocr_symbol'"
375  << " id='"
376  << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
377  for (const auto &timestep : symbol) {
378  hocr_str << "\n <span class='ocrx_cinfo'"
379  << " id='"
380  << "timestep" << page_id << "_" << wcnt << "_" << tcnt
381  << "'>";
382  for (auto conf : timestep) {
383  hocr_str << "\n <span class='ocrx_cinfo'"
384  << " id='"
385  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
386  << "'"
387  << " title='x_confs " << int(conf.second * 100) << "'>"
388  << HOcrEscape(conf.first).c_str() << "</span>";
389  ++ccnt;
390  }
391  hocr_str << "</span>";
392  ++tcnt;
393  }
394  hocr_str << "</span>";
395  ++scnt;
396  }
397  } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
398  for (const auto &timestep : *CTCMap) {
399  if (timestep.size() > 0) {
400  hocr_str << "\n <span class='ocrx_cinfo'"
401  << " id='"
402  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
403  << "'>";
404  for (auto &j : timestep) {
405  float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
406  if (conf < 0.0f) {
407  conf = 0.0f;
408  }
409  if (conf > 100.0f) {
410  conf = 100.0f;
411  }
412  hocr_str << "\n <span class='ocrx_cinfo'"
413  << " id='"
414  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
415  << "'"
416  << " title='x_confs " << conf << "'>"
417  << HOcrEscape(j.first).c_str() << "</span>";
418  ccnt++;
419  }
420  hocr_str << "</span>";
421  tcnt++;
422  }
423  }
424  }
425  // Close ocrx_word.
426  if (hocr_boxes || lstm_choice_mode > 0) {
427  hocr_str << "\n ";
428  }
429  hocr_str << "</span>";
430  tcnt = 1;
431  ccnt = 1;
432  wcnt++;
433  // Close any ending block/paragraph/textline.
434  if (last_word_in_line) {
435  hocr_str << "\n </span>";
436  lcnt++;
437  }
438  if (last_word_in_para) {
439  hocr_str << "\n </p>\n";
440  pcnt++;
441  para_is_ltr = true; // back to default direction
442  }
443  if (last_word_in_block) {
444  hocr_str << " </div>\n";
445  bcnt++;
446  }
447  }
448  hocr_str << " </div>\n";
449 
450  const std::string &text = hocr_str.str();
451  char *result = new char[text.length() + 1];
452  strcpy(result, text.c_str());
453  return result;
454 }
455 
456 /**********************************************************************
457  * HOcr Text Renderer interface implementation
458  **********************************************************************/
459 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
460  : TessResultRenderer(outputbase, "hocr") {
461  font_info_ = false;
462 }
463 
464 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
465  : TessResultRenderer(outputbase, "hocr") {
466  font_info_ = font_info;
467 }
468 
470  AppendString(
471  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
472  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
473  " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
474  "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
475  "lang=\"en\">\n <head>\n <title>");
476  AppendString(title());
477  AppendString(
478  "</title>\n"
479  " <meta http-equiv=\"Content-Type\" content=\"text/html;"
480  "charset=utf-8\"/>\n"
481  " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
482  "' />\n"
483  " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
484  " ocr_line ocrx_word ocrp_wconf");
485  if (font_info_) {
486  AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
487  }
488  AppendString(
489  "'/>\n"
490  " </head>\n"
491  " <body>\n");
492 
493  return true;
494 }
495 
497  AppendString(" </body>\n</html>\n");
498 
499  return true;
500 }
501 
503  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
504  if (hocr == nullptr) {
505  return false;
506  }
507 
508  AppendString(hocr.get());
509 
510  return true;
511 }
512 
513 } // namespace tesseract
#define TESSERACT_VERSION_STR
Definition: version.h:32
@ ORIENTATION_PAGE_UP
Definition: publictypes.h:117
@ DIR_MIX
Definition: unichar.h:47
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
@ DIR_NEUTRAL
Definition: unichar.h:44
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2338
@ PT_CAPTION_TEXT
Definition: publictypes.h:62
@ PT_PULLOUT_TEXT
Definition: publictypes.h:57
@ PT_HEADING_TEXT
Definition: publictypes.h:56
std::string input_file_
Name used by training code.
Definition: baseapi.h:775
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:831
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:774
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:767
ResultIterator * GetIterator()
Definition: baseapi.cpp:1313
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:267
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:301
const char * GetUTF8Text() const
std::vector< std::vector< std::pair< const char *, float > > > * Timesteps() const
const char * title() const
Definition: renderer.h:88
void AppendString(const char *s)
Definition: renderer.cpp:111
bool AddImageHandler(TessBaseAPI *api) override
TessHOcrRenderer(const char *outputbase, bool font_info)
bool BeginDocumentHandler() override
bool EndDocumentHandler() override