40 it->Orientation(&orientation, &writing_direction, &textline_order,
53 static void AddBaselineCoordsTohOCR(
const PageIterator *it,
55 std::stringstream &hocr_str) {
58 hocr_str <<
"; textangle " << 360 - orientation * 90;
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
84 double p1 = (y2 - y1) /
static_cast<double>(x2 - x1);
85 double p0 = y1 - p1 * x1;
87 hocr_str <<
"; baseline " << round(p1 * 1000.0) / 1000.0 <<
" "
88 << round(p0 * 1000.0) / 1000.0;
92 std::stringstream &hocr_str) {
93 int left, top, right, bottom;
94 it->BoundingBox(level, &left, &top, &right, &bottom);
97 hocr_str <<
" title=\"bbox " << left <<
" " << top <<
" " << right <<
" "
101 AddBaselineCoordsTohOCR(it, level, hocr_str);
103 float row_height, descenders, ascenders;
104 it->RowAttributes(&row_height, &descenders, &ascenders);
106 hocr_str <<
"; x_size " << row_height <<
"; x_descenders " << -descenders
107 <<
"; x_ascenders " << ascenders;
140 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
141 int page_id = page_number + 1;
142 bool para_is_ltr =
true;
143 const char *paragraph_lang =
nullptr;
144 bool font_info =
false;
145 bool hocr_boxes =
false;
156 MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1,
nullptr, 0);
157 wchar_t *uni16_str =
new WCHAR[str16_len];
158 str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1, uni16_str,
160 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
161 0,
nullptr,
nullptr);
162 char *utf8_str =
new char[utf8_len];
163 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
170 std::stringstream hocr_str;
172 hocr_str.imbue(std::locale::classic());
174 hocr_str.precision(8);
175 hocr_str <<
" <div class='ocr_page'"
177 <<
"page_" << page_id <<
"'"
178 <<
" title='image \"";
182 hocr_str <<
"unknown";
190 std::unique_ptr<ResultIterator> res_it(
GetIterator());
198 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
200 hocr_str <<
" <div class='ocr_carea'"
202 <<
"block_" << page_id <<
"_" << bcnt <<
"'";
203 AddBoxTohOCR(res_it.get(),
RIL_BLOCK, hocr_str);
205 if (res_it->IsAtBeginningOf(
RIL_PARA)) {
206 hocr_str <<
"\n <p class='ocr_par'";
207 para_is_ltr = res_it->ParagraphIsLtr();
209 hocr_str <<
" dir='rtl'";
212 <<
"par_" << page_id <<
"_" << pcnt <<
"'";
213 paragraph_lang = res_it->WordRecognitionLanguage();
214 if (paragraph_lang) {
215 hocr_str <<
" lang='" << paragraph_lang <<
"'";
217 AddBoxTohOCR(res_it.get(),
RIL_PARA, hocr_str);
220 hocr_str <<
"\n <span class='";
221 switch (res_it->BlockType()) {
223 hocr_str <<
"ocr_header";
226 hocr_str <<
"ocr_textfloat";
229 hocr_str <<
"ocr_caption";
232 hocr_str <<
"ocr_line";
235 <<
"line_" << page_id <<
"_" << lcnt <<
"'";
240 int32_t lstm_choice_mode =
tesseract_->lstm_choice_mode;
241 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
242 *rawTimestepMap =
nullptr;
243 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap =
nullptr;
244 if (lstm_choice_mode) {
245 CTCMap = res_it->GetBestLSTMSymbolChoices();
246 rawTimestepMap = res_it->GetRawLSTMTimesteps();
248 hocr_str <<
"\n <span class='ocrx_word'"
250 <<
"word_" << page_id <<
"_" << wcnt <<
"'";
251 int left, top, right, bottom;
252 bool bold, italic, underlined, monospace, serif, smallcaps;
253 int pointsize, font_id;
254 const char *font_name;
255 res_it->BoundingBox(
RIL_WORD, &left, &top, &right, &bottom);
257 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
258 &serif, &smallcaps, &pointsize, &font_id);
259 hocr_str <<
" title='bbox " << left <<
" " << top <<
" " << right <<
" "
260 << bottom <<
"; x_wconf "
261 <<
static_cast<int>(res_it->Confidence(
RIL_WORD));
264 hocr_str <<
"; x_font " <<
HOcrEscape(font_name).c_str();
266 hocr_str <<
"; x_fsize " << pointsize;
269 const char *lang = res_it->WordRecognitionLanguage();
270 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
271 hocr_str <<
" lang='" << lang <<
"'";
273 switch (res_it->WordDirection()) {
277 hocr_str <<
" dir='ltr'";
282 hocr_str <<
" dir='rtl'";
295 hocr_str <<
"<strong>";
301 const std::unique_ptr<const char[]> grapheme(
303 if (grapheme && grapheme[0] != 0) {
305 res_it->BoundingBox(
RIL_SYMBOL, &left, &top, &right, &bottom);
306 hocr_str <<
"\n <span class='ocrx_cinfo' title='x_bboxes "
307 << left <<
" " << top <<
" " << right <<
" " << bottom
308 <<
"; x_conf " << res_it->Confidence(
RIL_SYMBOL) <<
"'>";
310 hocr_str <<
HOcrEscape(grapheme.get()).c_str();
312 hocr_str <<
"</span>";
314 if (lstm_choice_mode == 1 && ci.
Timesteps() !=
nullptr) {
315 std::vector<std::vector<std::pair<const char *, float>>> *symbol =
317 hocr_str <<
"\n <span class='ocr_symbol'"
319 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt
321 for (
const auto &timestep : *symbol) {
322 hocr_str <<
"\n <span class='ocrx_cinfo'"
324 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
326 for (
auto conf : timestep) {
327 hocr_str <<
"\n <span class='ocrx_cinfo'"
329 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
331 <<
" title='x_confs " << int(conf.second * 100) <<
"'>"
332 <<
HOcrEscape(conf.first).c_str() <<
"</span>";
335 hocr_str <<
"</span>";
338 hocr_str <<
"\n </span>";
340 }
else if (lstm_choice_mode == 2) {
341 hocr_str <<
"\n <span class='ocrx_cinfo'"
343 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
348 if (choice !=
nullptr) {
349 hocr_str <<
"\n <span class='ocrx_cinfo'"
351 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
353 <<
" title='x_confs " << choiceconf <<
"'>"
358 hocr_str <<
"\n </span>";
369 hocr_str <<
"</strong>";
372 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap !=
nullptr) {
373 for (
const auto &symbol : *rawTimestepMap) {
374 hocr_str <<
"\n <span class='ocr_symbol'"
376 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt <<
"'>";
377 for (
const auto &timestep : symbol) {
378 hocr_str <<
"\n <span class='ocrx_cinfo'"
380 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
382 for (
auto conf : timestep) {
383 hocr_str <<
"\n <span class='ocrx_cinfo'"
385 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
387 <<
" title='x_confs " << int(conf.second * 100) <<
"'>"
388 <<
HOcrEscape(conf.first).c_str() <<
"</span>";
391 hocr_str <<
"</span>";
394 hocr_str <<
"</span>";
397 }
else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap !=
nullptr) {
398 for (
const auto &timestep : *CTCMap) {
399 if (timestep.size() > 0) {
400 hocr_str <<
"\n <span class='ocrx_cinfo'"
402 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
404 for (
auto &j : timestep) {
405 float conf = 100 -
tesseract_->lstm_rating_coefficient * j.second;
412 hocr_str <<
"\n <span class='ocrx_cinfo'"
414 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
416 <<
" title='x_confs " << conf <<
"'>"
420 hocr_str <<
"</span>";
426 if (hocr_boxes || lstm_choice_mode > 0) {
429 hocr_str <<
"</span>";
434 if (last_word_in_line) {
435 hocr_str <<
"\n </span>";
438 if (last_word_in_para) {
439 hocr_str <<
"\n </p>\n";
443 if (last_word_in_block) {
444 hocr_str <<
" </div>\n";
448 hocr_str <<
" </div>\n";
450 const std::string &text = hocr_str.str();
451 char *result =
new char[text.length() + 1];
452 strcpy(result, text.c_str());
466 font_info_ = font_info;
471 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
472 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
473 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
474 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
475 "lang=\"en\">\n <head>\n <title>");
479 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
480 "charset=utf-8\"/>\n"
483 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
484 " ocr_line ocrx_word ocrp_wconf");
486 AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
504 if (hocr ==
nullptr) {
#define TESSERACT_VERSION_STR
std::string HOcrEscape(const char *text)
std::string input_file_
Name used by training code.
int Recognize(ETEXT_DESC *monitor)
PAGE_RES * page_res_
The page-level data.
Tesseract * tesseract_
The underlying data object.
int GetSourceYResolution()
ResultIterator * GetIterator()
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
bool GetBoolVariable(const char *name, bool *value) const
const char * GetUTF8Text() const
std::vector< std::vector< std::pair< const char *, float > > > * Timesteps() const
const char * title() const
void AppendString(const char *s)
bool AddImageHandler(TessBaseAPI *api) override
TessHOcrRenderer(const char *outputbase, bool font_info)
bool BeginDocumentHandler() override
bool EndDocumentHandler() override