tesseract  5.0.0
altorenderer.cpp
Go to the documentation of this file.
1 // File: altorenderer.cpp
2 // Description: ALTO rendering interface
3 // Author: Jake Sebright
4 
5 // (C) Copyright 2018
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifdef _WIN32
17 # include "host.h" // windows.h for MultiByteToWideChar, ...
18 #endif
19 
20 #include <tesseract/baseapi.h>
21 #include <tesseract/renderer.h>
22 
23 #include <memory>
24 #include <sstream> // for std::stringstream
25 
26 namespace tesseract {
27 
31 static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
32  std::stringstream &alto_str) {
33  int left, top, right, bottom;
34  it->BoundingBox(level, &left, &top, &right, &bottom);
35 
36  int hpos = left;
37  int vpos = top;
38  int height = bottom - top;
39  int width = right - left;
40 
41  alto_str << " HPOS=\"" << hpos << "\"";
42  alto_str << " VPOS=\"" << vpos << "\"";
43  alto_str << " WIDTH=\"" << width << "\"";
44  alto_str << " HEIGHT=\"" << height << "\"";
45 
46  if (level == RIL_WORD) {
47  int wc = it->Confidence(RIL_WORD);
48  alto_str << " WC=\"0." << wc << "\"";
49  } else {
50  alto_str << ">";
51  }
52 }
53 
58  // Delay the XML output because we need the name of the image file.
59  begin_document = true;
60  return true;
61 }
62 
67  if (begin_document) {
69  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
70  "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
71  "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
72  "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
73  "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
74  "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
75  "\t<Description>\n"
76  "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
77  "\t\t<sourceImageInformation>\n"
78  "\t\t\t<fileName>");
79 
80  AppendString(api->GetInputName());
81 
83  "</fileName>\n"
84  "\t\t</sourceImageInformation>\n"
85  "\t\t<OCRProcessing ID=\"OCR_0\">\n"
86  "\t\t\t<ocrProcessingStep>\n"
87  "\t\t\t\t<processingSoftware>\n"
88  "\t\t\t\t\t<softwareName>tesseract ");
91  "</softwareName>\n"
92  "\t\t\t\t</processingSoftware>\n"
93  "\t\t\t</ocrProcessingStep>\n"
94  "\t\t</OCRProcessing>\n"
95  "\t</Description>\n"
96  "\t<Layout>\n");
97  begin_document = false;
98  }
99 
100  const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
101  if (text == nullptr) {
102  return false;
103  }
104 
105  AppendString(text.get());
106 
107  return true;
108 }
109 
114  AppendString("\t</Layout>\n</alto>\n");
115 
116  return true;
117 }
118 
119 TessAltoRenderer::TessAltoRenderer(const char *outputbase)
120  : TessResultRenderer(outputbase, "xml"),
121  begin_document(false) {}
122 
127 char *TessBaseAPI::GetAltoText(int page_number) {
128  return GetAltoText(nullptr, page_number);
129 }
130 
135 char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
136  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
137  return nullptr;
138  }
139 
140  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
141 
142  if (input_file_.empty()) {
143  SetInputName(nullptr);
144  }
145 
146 #ifdef _WIN32
147  // convert input name from ANSI encoding to utf-8
148  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
149  wchar_t *uni16_str = new WCHAR[str16_len];
150  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
151  int utf8_len =
152  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
153  char *utf8_str = new char[utf8_len];
154  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
155  input_file_ = utf8_str;
156  delete[] uni16_str;
157  delete[] utf8_str;
158 #endif
159 
160  std::stringstream alto_str;
161  // Use "C" locale (needed for int values larger than 999).
162  alto_str.imbue(std::locale::classic());
163  alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
164  << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
165  << " ID=\"page_" << page_number << "\">\n"
166  << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
167  << " WIDTH=\"" << rect_width_ << "\""
168  << " HEIGHT=\"" << rect_height_ << "\">\n";
169 
170  ResultIterator *res_it = GetIterator();
171  while (!res_it->Empty(RIL_BLOCK)) {
172  if (res_it->Empty(RIL_WORD)) {
173  res_it->Next(RIL_WORD);
174  continue;
175  }
176 
177  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
178  alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
179  AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
180  alto_str << "\n";
181  }
182 
183  if (res_it->IsAtBeginningOf(RIL_PARA)) {
184  alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
185  AddBoxToAlto(res_it, RIL_PARA, alto_str);
186  alto_str << "\n";
187  }
188 
189  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
190  alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
191  AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
192  alto_str << "\n";
193  }
194 
195  alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
196  AddBoxToAlto(res_it, RIL_WORD, alto_str);
197  alto_str << " CONTENT=\"";
198 
199  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
200  bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
201  bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
202 
203  int left, top, right, bottom;
204  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
205 
206  do {
207  const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
208  if (grapheme && grapheme[0] != 0) {
209  alto_str << HOcrEscape(grapheme.get()).c_str();
210  }
211  res_it->Next(RIL_SYMBOL);
212  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
213 
214  alto_str << "\"/>";
215 
216  wcnt++;
217 
218  if (last_word_in_line) {
219  alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
220  lcnt++;
221  } else {
222  int hpos = right;
223  int vpos = top;
224  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
225  int width = left - hpos;
226  alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
227  << "\"/>\n";
228  }
229 
230  if (last_word_in_tblock) {
231  alto_str << "\t\t\t\t\t</TextBlock>\n";
232  tcnt++;
233  }
234 
235  if (last_word_in_cblock) {
236  alto_str << "\t\t\t\t</ComposedBlock>\n";
237  bcnt++;
238  }
239  }
240 
241  alto_str << "\t\t\t</PrintSpace>\n"
242  << "\t\t</Page>\n";
243  const std::string &text = alto_str.str();
244 
245  char *result = new char[text.length() + 1];
246  strcpy(result, text.c_str());
247  delete res_it;
248  return result;
249 }
250 
251 } // namespace tesseract
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2338
const char * GetInputName()
Definition: baseapi.cpp:925
std::string input_file_
Name used by training code.
Definition: baseapi.h:775
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:831
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:774
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:767
static const char * Version()
Definition: baseapi.cpp:238
ResultIterator * GetIterator()
Definition: baseapi.cpp:1313
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:267
bool Empty(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
Definition: renderer.cpp:111
bool BeginDocumentHandler() override
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override