tesseract  5.0.0
shapetable.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.cpp
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "shapetable.h"
23 
24 #include "bitvector.h"
25 #include "fontinfo.h"
26 #include "intfeaturespace.h"
27 #include "unicharset.h"
28 #include "unicity_table.h"
29 
30 #include <algorithm>
31 
32 namespace tesseract {
33 
34 // Helper function to get the index of the first result with the required
35 // unichar_id. If the results are sorted by rating, this will also be the
36 // best result with the required unichar_id.
37 // Returns -1 if the unichar_id is not found
38 int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
39  const ShapeTable &shape_table, UNICHAR_ID unichar_id) {
40  for (unsigned r = 0; r < results.size(); ++r) {
41  const auto shape_id = results[r].shape_id;
42  const Shape &shape = shape_table.GetShape(shape_id);
43  if (shape.ContainsUnichar(unichar_id)) {
44  return r;
45  }
46  }
47  return -1;
48 }
49 
50 // Helper function to get the index of the first result with the required
51 // unichar_id. If the results are sorted by rating, this will also be the
52 // best result with the required unichar_id.
53 // Returns -1 if the unichar_id is not found
54 int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,
55  UNICHAR_ID unichar_id) {
56  for (unsigned r = 0; r < results.size(); ++r) {
57  if (results[r].unichar_id == unichar_id) {
58  return r;
59  }
60  }
61  return -1;
62 }
63 
64 // Writes to the given file. Returns false in case of error.
65 bool UnicharAndFonts::Serialize(FILE *fp) const {
67 }
68 
69 // Reads from the given file. Returns false in case of error.
71  return fp->DeSerialize(&unichar_id) && fp->DeSerialize(font_ids);
72 }
73 
74 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
75 int UnicharAndFonts::SortByUnicharId(const void *v1, const void *v2) {
76  const auto *p1 = static_cast<const UnicharAndFonts *>(v1);
77  const auto *p2 = static_cast<const UnicharAndFonts *>(v2);
78  return p1->unichar_id - p2->unichar_id;
79 }
80 
82  return v1.unichar_id < v2.unichar_id;
83 }
84 
85 // Writes to the given file. Returns false in case of error.
86 bool Shape::Serialize(FILE *fp) const {
87  uint8_t sorted = unichars_sorted_;
88  return tesseract::Serialize(fp, &sorted) && tesseract::Serialize(fp, unichars_);
89 }
90 // Reads from the given file. Returns false in case of error.
91 
93  uint8_t sorted;
94  if (!fp->DeSerialize(&sorted)) {
95  return false;
96  }
97  unichars_sorted_ = sorted != 0;
98  return fp->DeSerialize(unichars_);
99 }
100 
101 // Adds a font_id for the given unichar_id. If the unichar_id is not
102 // in the shape, it is added.
103 void Shape::AddToShape(int unichar_id, int font_id) {
104  for (auto &unichar : unichars_) {
105  if (unichar.unichar_id == unichar_id) {
106  // Found the unichar in the shape table.
107  std::vector<int> &font_list = unichar.font_ids;
108  for (int f : font_list) {
109  if (f == font_id) {
110  return; // Font is already there.
111  }
112  }
113  font_list.push_back(font_id);
114  return;
115  }
116  }
117  // Unichar_id is not in shape, so add it to shape.
118  unichars_.emplace_back(unichar_id, font_id);
119  unichars_sorted_ = unichars_.size() <= 1;
120 }
121 
122 // Adds everything in other to this.
123 void Shape::AddShape(const Shape &other) {
124  for (const auto &unichar : other.unichars_) {
125  for (unsigned f = 0; f < unichar.font_ids.size(); ++f) {
126  AddToShape(unichar.unichar_id, unichar.font_ids[f]);
127  }
128  }
129  unichars_sorted_ = unichars_.size() <= 1;
130 }
131 
132 // Returns true if the shape contains the given unichar_id, font_id pair.
133 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
134  for (const auto &unichar : unichars_) {
135  if (unichar.unichar_id == unichar_id) {
136  // Found the unichar, so look for the font.
137  auto &font_list = unichar.font_ids;
138  for (int f : font_list) {
139  if (f == font_id) {
140  return true;
141  }
142  }
143  return false;
144  }
145  }
146  return false;
147 }
148 
149 // Returns true if the shape contains the given unichar_id, ignoring font.
150 bool Shape::ContainsUnichar(int unichar_id) const {
151  for (const auto &unichar : unichars_) {
152  if (unichar.unichar_id == unichar_id) {
153  return true;
154  }
155  }
156  return false;
157 }
158 
159 // Returns true if the shape contains the given font, ignoring unichar_id.
160 bool Shape::ContainsFont(int font_id) const {
161  for (const auto &unichar : unichars_) {
162  auto &font_list = unichar.font_ids;
163  for (int f : font_list) {
164  if (f == font_id) {
165  return true;
166  }
167  }
168  }
169  return false;
170 }
171 // Returns true if the shape contains the given font properties, ignoring
172 // unichar_id.
173 bool Shape::ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const {
174  for (const auto &unichar : unichars_) {
175  auto &font_list = unichar.font_ids;
176  for (int f : font_list) {
177  if (font_table.at(f).properties == properties) {
178  return true;
179  }
180  }
181  }
182  return false;
183 }
184 // Returns true if the shape contains multiple different font properties,
185 // ignoring unichar_id.
187  uint32_t properties = font_table.at(unichars_[0].font_ids[0]).properties;
188  for (const auto &unichar : unichars_) {
189  auto &font_list = unichar.font_ids;
190  for (int f : font_list) {
191  if (font_table.at(f).properties != properties) {
192  return true;
193  }
194  }
195  }
196  return false;
197 }
198 
199 // Returns true if this shape is equal to other (ignoring order of unichars
200 // and fonts).
201 bool Shape::operator==(const Shape &other) const {
202  return IsSubsetOf(other) && other.IsSubsetOf(*this);
203 }
204 
205 // Returns true if this is a subset (including equal) of other.
206 bool Shape::IsSubsetOf(const Shape &other) const {
207  for (const auto &unichar : unichars_) {
208  int unichar_id = unichar.unichar_id;
209  const std::vector<int> &font_list = unichar.font_ids;
210  for (int f : font_list) {
211  if (!other.ContainsUnicharAndFont(unichar_id, f)) {
212  return false;
213  }
214  }
215  }
216  return true;
217 }
218 
219 // Returns true if the lists of unichar ids are the same in this and other,
220 // ignoring fonts.
221 // NOT const, as it will sort the unichars on demand.
223  if (unichars_.size() != other->unichars_.size()) {
224  return false;
225  }
226  if (!unichars_sorted_) {
227  SortUnichars();
228  }
229  if (!other->unichars_sorted_) {
230  other->SortUnichars();
231  }
232  for (unsigned c = 0; c < unichars_.size(); ++c) {
233  if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {
234  return false;
235  }
236  }
237  return true;
238 }
239 
240 // Sorts the unichars_ vector by unichar.
241 void Shape::SortUnichars() {
242  std::sort(unichars_.begin(), unichars_.end(), UnicharAndFonts::StdSortByUnicharId);
243  unichars_sorted_ = true;
244 }
245 
246 ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {}
247 ShapeTable::ShapeTable(const UNICHARSET &unicharset) : unicharset_(&unicharset), num_fonts_(0) {}
248 
249 // Writes to the given file. Returns false in case of error.
250 bool ShapeTable::Serialize(FILE *fp) const {
251  return tesseract::Serialize(fp, shape_table_);
252 }
253 // Reads from the given file. Returns false in case of error.
254 
256  if (!fp->DeSerialize(shape_table_)) {
257  return false;
258  }
259  num_fonts_ = 0;
260  return true;
261 }
262 
263 // Returns the number of fonts used in this ShapeTable, computing it if
264 // necessary.
265 int ShapeTable::NumFonts() const {
266  if (num_fonts_ <= 0) {
267  for (auto shape_id : shape_table_) {
268  const Shape &shape = *shape_id;
269  for (int c = 0; c < shape.size(); ++c) {
270  for (int font_id : shape[c].font_ids) {
271  if (font_id >= num_fonts_) {
272  num_fonts_ = font_id + 1;
273  }
274  }
275  }
276  }
277  }
278  return num_fonts_;
279 }
280 
281 // Re-indexes the class_ids in the shapetable according to the given map.
282 // Useful in conjunction with set_unicharset.
283 void ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) {
284  for (auto shape : shape_table_) {
285  for (int c = 0; c < shape->size(); ++c) {
286  shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
287  }
288  }
289 }
290 
291 // Returns a string listing the classes/fonts in a shape.
292 std::string ShapeTable::DebugStr(unsigned shape_id) const {
293  if (shape_id >= shape_table_.size()) {
294  return "INVALID_UNICHAR_ID";
295  }
296  const Shape &shape = GetShape(shape_id);
297  std::string result;
298  result += "Shape" + std::to_string(shape_id);
299  if (shape.size() > 100) {
300  result += " Num unichars=" + std::to_string(shape.size());
301  return result;
302  }
303  for (int c = 0; c < shape.size(); ++c) {
304  result += " c_id=" + std::to_string(shape[c].unichar_id);
305  result += "=";
306  result += unicharset_->id_to_unichar(shape[c].unichar_id);
307  if (shape.size() < 10) {
308  result += ", " + std::to_string(shape[c].font_ids.size());
309  result += " fonts =";
310  int num_fonts = shape[c].font_ids.size();
311  if (num_fonts > 10) {
312  result += " " + std::to_string(shape[c].font_ids[0]);
313  result += " ... " + std::to_string(shape[c].font_ids[num_fonts - 1]);
314  } else {
315  for (int f = 0; f < num_fonts; ++f) {
316  result += " " + std::to_string(shape[c].font_ids[f]);
317  }
318  }
319  }
320  }
321  return result;
322 }
323 
324 // Returns a debug string summarizing the table.
325 std::string ShapeTable::SummaryStr() const {
326  int max_unichars = 0;
327  int num_multi_shapes = 0;
328  int num_master_shapes = 0;
329  for (unsigned s = 0; s < shape_table_.size(); ++s) {
330  if (MasterDestinationIndex(s) != s) {
331  continue;
332  }
333  ++num_master_shapes;
334  int shape_size = GetShape(s).size();
335  if (shape_size > 1) {
336  ++num_multi_shapes;
337  }
338  if (shape_size > max_unichars) {
339  max_unichars = shape_size;
340  }
341  }
342  std::string result;
343  result += "Number of shapes = " + std::to_string(num_master_shapes);
344  result += " max unichars = " + std::to_string(max_unichars);
345  result += " number with multiple unichars = " + std::to_string(num_multi_shapes);
346  return result;
347 }
348 
349 // Adds a new shape starting with the given unichar_id and font_id.
350 // Returns the assigned index.
351 unsigned ShapeTable::AddShape(int unichar_id, int font_id) {
352  auto index = shape_table_.size();
353  auto *shape = new Shape;
354  shape->AddToShape(unichar_id, font_id);
355  shape_table_.push_back(shape);
356  num_fonts_ = std::max(num_fonts_, font_id + 1);
357  return index;
358 }
359 
360 // Adds a copy of the given shape unless it is already present.
361 // Returns the assigned index or index of existing shape if already present.
362 unsigned ShapeTable::AddShape(const Shape &other) {
363  unsigned index;
364  for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {
365  continue;
366  }
367  if (index == shape_table_.size()) {
368  auto *shape = new Shape(other);
369  shape_table_.push_back(shape);
370  }
371  num_fonts_ = 0;
372  return index;
373 }
374 
375 // Removes the shape given by the shape index.
376 void ShapeTable::DeleteShape(unsigned shape_id) {
377  delete shape_table_[shape_id];
378  shape_table_.erase(shape_table_.begin() + shape_id);
379 }
380 
381 // Adds a font_id to the given existing shape index for the given
382 // unichar_id. If the unichar_id is not in the shape, it is added.
383 void ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) {
384  Shape &shape = *shape_table_[shape_id];
385  shape.AddToShape(unichar_id, font_id);
386  num_fonts_ = std::max(num_fonts_, font_id + 1);
387 }
388 
389 // Adds the given shape to the existing shape with the given index.
390 void ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) {
391  Shape &shape = *shape_table_[shape_id];
392  shape.AddShape(other);
393  num_fonts_ = 0;
394 }
395 
396 // Returns the id of the shape that contains the given unichar and font.
397 // If not found, returns -1.
398 // If font_id < 0, the font_id is ignored and the first shape that matches
399 // the unichar_id is returned.
400 int ShapeTable::FindShape(int unichar_id, int font_id) const {
401  for (unsigned s = 0; s < shape_table_.size(); ++s) {
402  const Shape &shape = GetShape(s);
403  for (int c = 0; c < shape.size(); ++c) {
404  if (shape[c].unichar_id == unichar_id) {
405  if (font_id < 0) {
406  return s; // We don't care about the font.
407  }
408  for (int f : shape[c].font_ids) {
409  if (f == font_id) {
410  return s;
411  }
412  }
413  }
414  }
415  }
416  return -1;
417 }
418 
419 // Returns the first unichar_id and font_id in the given shape.
420 void ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const {
421  const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];
422  *unichar_id = unichar_and_fonts.unichar_id;
423  *font_id = unichar_and_fonts.font_ids[0];
424 }
425 
426 // Expands all the classes/fonts in the shape individually to build
427 // a ShapeTable.
428 int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {
429  BitVector shape_map(master_shapes.NumShapes());
430  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
431  for (unsigned f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
432  int c = shape[u_ind].unichar_id;
433  int f = shape[u_ind].font_ids[f_ind];
434  int master_id = master_shapes.FindShape(c, f);
435  if (master_id >= 0) {
436  shape_map.SetBit(master_id);
437  } else if (FindShape(c, f) < 0) {
438  AddShape(c, f);
439  }
440  }
441  }
442  int num_masters = 0;
443  for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) {
444  if (shape_map[s]) {
445  AddShape(master_shapes.GetShape(s));
446  ++num_masters;
447  }
448  }
449  return num_masters;
450 }
451 
452 // Returns true if the shapes are already merged.
453 bool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const {
454  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
455 }
456 
457 // Returns true if any shape contains multiple unichars.
459  auto num_shapes = NumShapes();
460  for (unsigned s1 = 0; s1 < num_shapes; ++s1) {
461  if (MasterDestinationIndex(s1) != s1) {
462  continue;
463  }
464  if (GetShape(s1).size() > 1) {
465  return true;
466  }
467  }
468  return false;
469 }
470 
471 // Returns the maximum number of unichars over all shapes.
473  int max_num_unichars = 0;
474  int num_shapes = NumShapes();
475  for (int s = 0; s < num_shapes; ++s) {
476  if (GetShape(s).size() > max_num_unichars) {
477  max_num_unichars = GetShape(s).size();
478  }
479  }
480  return max_num_unichars;
481 }
482 
483 // Merges shapes with a common unichar over the [start, end) interval.
484 // Assumes single unichar per shape.
485 void ShapeTable::ForceFontMerges(unsigned start, unsigned end) {
486  for (unsigned s1 = start; s1 < end; ++s1) {
487  if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
488  int unichar_id = GetShape(s1)[0].unichar_id;
489  for (auto s2 = s1 + 1; s2 < end; ++s2) {
490  if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
491  unichar_id == GetShape(s2)[0].unichar_id) {
492  MergeShapes(s1, s2);
493  }
494  }
495  }
496  }
497  ShapeTable compacted(*unicharset_);
498  compacted.AppendMasterShapes(*this, nullptr);
499  *this = compacted;
500 }
501 
502 // Returns the number of unichars in the master shape.
503 unsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const {
504  int master_id = MasterDestinationIndex(shape_id);
505  return GetShape(master_id).size();
506 }
507 
508 // Returns the sum of the font counts in the master shape.
509 int ShapeTable::MasterFontCount(unsigned shape_id) const {
510  int master_id = MasterDestinationIndex(shape_id);
511  const Shape &shape = GetShape(master_id);
512  int font_count = 0;
513  for (int c = 0; c < shape.size(); ++c) {
514  font_count += shape[c].font_ids.size();
515  }
516  return font_count;
517 }
518 
519 // Returns the number of unichars that would result from merging the shapes.
520 int ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const {
521  // Do it the easy way for now.
522  int master_id1 = MasterDestinationIndex(shape_id1);
523  int master_id2 = MasterDestinationIndex(shape_id2);
524  Shape combined_shape(*shape_table_[master_id1]);
525  combined_shape.AddShape(*shape_table_[master_id2]);
526  return combined_shape.size();
527 }
528 
529 // Merges two shape_ids, leaving shape_id2 marked as merged.
530 void ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) {
531  auto master_id1 = MasterDestinationIndex(shape_id1);
532  auto master_id2 = MasterDestinationIndex(shape_id2);
533  // Point master_id2 (and all merged shapes) to master_id1.
534  shape_table_[master_id2]->set_destination_index(master_id1);
535  // Add all the shapes of master_id2 to master_id1.
536  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
537 }
538 
539 // Swaps two shape_ids.
540 void ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) {
541  Shape *tmp = shape_table_[shape_id1];
542  shape_table_[shape_id1] = shape_table_[shape_id2];
543  shape_table_[shape_id2] = tmp;
544 }
545 
546 // Returns the destination of this shape, (if merged), taking into account
547 // the fact that the destination may itself have been merged.
548 unsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const {
549  auto dest_id = shape_table_[shape_id]->destination_index();
550  if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) {
551  return shape_id; // Is master already.
552  }
553  auto master_id = shape_table_[dest_id]->destination_index();
554  if (master_id == dest_id || master_id < 0) {
555  return dest_id; // Dest is the master and shape_id points to it.
556  }
557  master_id = MasterDestinationIndex(master_id);
558  return master_id;
559 }
560 
561 // Returns false if the unichars in neither shape is a subset of the other.
562 bool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const {
563  const Shape &shape1 = GetShape(shape_id1);
564  const Shape &shape2 = GetShape(shape_id2);
565  int c1, c2;
566  for (c1 = 0; c1 < shape1.size(); ++c1) {
567  int unichar_id1 = shape1[c1].unichar_id;
568  if (!shape2.ContainsUnichar(unichar_id1)) {
569  break;
570  }
571  }
572  for (c2 = 0; c2 < shape2.size(); ++c2) {
573  int unichar_id2 = shape2[c2].unichar_id;
574  if (!shape1.ContainsUnichar(unichar_id2)) {
575  break;
576  }
577  }
578  return c1 == shape1.size() || c2 == shape2.size();
579 }
580 
581 // Returns false if the unichars in neither shape is a subset of the other.
582 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const {
583  const Shape &merge1 = GetShape(merge_id1);
584  const Shape &merge2 = GetShape(merge_id2);
585  const Shape &shape = GetShape(shape_id);
586  int cm1, cm2, cs;
587  for (cs = 0; cs < shape.size(); ++cs) {
588  int unichar_id = shape[cs].unichar_id;
589  if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
590  break; // Shape is not a subset of the merge.
591  }
592  }
593  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
594  int unichar_id1 = merge1[cm1].unichar_id;
595  if (!shape.ContainsUnichar(unichar_id1)) {
596  break; // Merge is not a subset of shape
597  }
598  }
599  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
600  int unichar_id2 = merge2[cm2].unichar_id;
601  if (!shape.ContainsUnichar(unichar_id2)) {
602  break; // Merge is not a subset of shape
603  }
604  }
605  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
606 }
607 
608 // Returns true if the unichar sets are equal between the shapes.
609 bool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const {
610  const Shape &shape1 = GetShape(shape_id1);
611  const Shape &shape2 = GetShape(shape_id2);
612  for (int c1 = 0; c1 < shape1.size(); ++c1) {
613  int unichar_id1 = shape1[c1].unichar_id;
614  if (!shape2.ContainsUnichar(unichar_id1)) {
615  return false;
616  }
617  }
618  for (int c2 = 0; c2 < shape2.size(); ++c2) {
619  int unichar_id2 = shape2[c2].unichar_id;
620  if (!shape1.ContainsUnichar(unichar_id2)) {
621  return false;
622  }
623  }
624  return true;
625 }
626 
627 // Returns true if the unichar sets are equal between the shapes.
628 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const {
629  const Shape &merge1 = GetShape(merge_id1);
630  const Shape &merge2 = GetShape(merge_id2);
631  const Shape &shape = GetShape(shape_id);
632  for (int cs = 0; cs < shape.size(); ++cs) {
633  int unichar_id = shape[cs].unichar_id;
634  if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
635  return false; // Shape has a unichar that appears in neither merge.
636  }
637  }
638  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
639  int unichar_id1 = merge1[cm1].unichar_id;
640  if (!shape.ContainsUnichar(unichar_id1)) {
641  return false; // Merge has a unichar that is not in shape.
642  }
643  }
644  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
645  int unichar_id2 = merge2[cm2].unichar_id;
646  if (!shape.ContainsUnichar(unichar_id2)) {
647  return false; // Merge has a unichar that is not in shape.
648  }
649  }
650  return true;
651 }
652 
653 // Returns true if there is a common unichar between the shapes.
654 bool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const {
655  const Shape &shape1 = GetShape(shape_id1);
656  const Shape &shape2 = GetShape(shape_id2);
657  for (int c1 = 0; c1 < shape1.size(); ++c1) {
658  int unichar_id1 = shape1[c1].unichar_id;
659  if (shape2.ContainsUnichar(unichar_id1)) {
660  return true;
661  }
662  }
663  return false;
664 }
665 
666 // Returns true if there is a common font id between the shapes.
667 bool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const {
668  const Shape &shape1 = GetShape(shape_id1);
669  const Shape &shape2 = GetShape(shape_id2);
670  for (int c1 = 0; c1 < shape1.size(); ++c1) {
671  const std::vector<int> &font_list1 = shape1[c1].font_ids;
672  for (int f : font_list1) {
673  if (shape2.ContainsFont(f)) {
674  return true;
675  }
676  }
677  }
678  return false;
679 }
680 
681 // Appends the master shapes from other to this.
682 // If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
683 void ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map) {
684  if (shape_map != nullptr) {
685  shape_map->clear();
686  shape_map->resize(other.NumShapes(), -1);
687  }
688  for (unsigned s = 0; s < other.shape_table_.size(); ++s) {
689  if (other.shape_table_[s]->destination_index() < 0) {
690  int index = AddShape(*other.shape_table_[s]);
691  if (shape_map != nullptr) {
692  (*shape_map)[s] = index;
693  }
694  }
695  }
696 }
697 
698 // Returns the number of master shapes remaining after merging.
700  int num_shapes = 0;
701  for (auto s : shape_table_) {
702  if (s->destination_index() < 0) {
703  ++num_shapes;
704  }
705  }
706  return num_shapes;
707 }
708 
709 // Adds the unichars of the given shape_id to the vector of results. Any
710 // unichar_id that is already present just has the fonts added to the
711 // font set for that result without adding a new entry in the vector.
712 // NOTE: it is assumed that the results are given to this function in order
713 // of decreasing rating.
714 // The unichar_map vector indicates the index of the results entry containing
715 // each unichar, or -1 if the unichar is not yet included in results.
716 void ShapeTable::AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
717  std::vector<UnicharRating> *results) const {
718  if (shape_rating.joined) {
719  AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results);
720  }
721  if (shape_rating.broken) {
722  AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results);
723  }
724  const Shape &shape = GetShape(shape_rating.shape_id);
725  for (int u = 0; u < shape.size(); ++u) {
726  int result_index =
727  AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results);
728  for (int font_id : shape[u].font_ids) {
729  (*results)[result_index].fonts.emplace_back(font_id,
730  IntCastRounded(shape_rating.rating * INT16_MAX));
731  }
732  }
733 }
734 
735 // Adds the given unichar_id to the results if needed, updating unichar_map
736 // and returning the index of unichar in results.
737 int ShapeTable::AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
738  std::vector<UnicharRating> *results) const {
739  int result_index = unichar_map->at(unichar_id);
740  if (result_index < 0) {
741  UnicharRating result(unichar_id, rating);
742  result_index = results->size();
743  results->push_back(result);
744  (*unichar_map)[unichar_id] = result_index;
745  }
746  return result_index;
747 }
748 
749 } // namespace tesseract
int IntCastRounded(double x)
Definition: helpers.h:175
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
int UNICHAR_ID
Definition: unichar.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ UNICHAR_JOINED
Definition: unicharset.h:37
uint32_t properties
Definition: fontinfo.h:135
void SetBit(int index)
Definition: bitvector.h:78
T & at(int index) const
Definition: genericvector.h:93
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
static int FirstResultWithUnichar(const std::vector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:54
static int FirstResultWithUnichar(const std::vector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2)
Definition: shapetable.cpp:81
std::vector< int32_t > font_ids
Definition: shapetable.h:144
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:75
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:70
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:65
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:206
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:186
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:133
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:160
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:103
int size() const
Definition: shapetable.h:169
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:86
void AddShape(const Shape &other)
Definition: shapetable.cpp:123
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:222
bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const
Definition: shapetable.cpp:173
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:92
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:150
bool operator==(const Shape &other) const
Definition: shapetable.cpp:201
void SwapShapes(unsigned shape_id1, unsigned shape_id2)
Definition: shapetable.cpp:540
void AddToShape(unsigned shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:383
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:458
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255
int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:520
bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const
Definition: shapetable.cpp:628
void ReMapClassIds(const std::vector< int > &unicharset_map)
Definition: shapetable.cpp:283
bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const
Definition: shapetable.cpp:582
std::string DebugStr(unsigned shape_id) const
Definition: shapetable.cpp:292
unsigned AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:351
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292
int NumMasterShapes() const
Definition: shapetable.cpp:699
std::string SummaryStr() const
Definition: shapetable.cpp:325
unsigned MasterDestinationIndex(unsigned shape_id) const
Definition: shapetable.cpp:548
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
int MasterFontCount(unsigned shape_id) const
Definition: shapetable.cpp:509
unsigned NumShapes() const
Definition: shapetable.h:248
void DeleteShape(unsigned shape_id)
Definition: shapetable.cpp:376
bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:453
void AddShapeToShape(unsigned shape_id, const Shape &other)
Definition: shapetable.cpp:390
void AddShapeToResults(const ShapeRating &shape_rating, std::vector< int > *unichar_map, std::vector< UnicharRating > *results) const
Definition: shapetable.cpp:716
int MaxNumUnichars() const
Definition: shapetable.cpp:472
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:428
bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:609
void MergeShapes(unsigned shape_id1, unsigned shape_id2)
Definition: shapetable.cpp:530
unsigned MasterUnicharCount(unsigned shape_id) const
Definition: shapetable.cpp:503
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:420
bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:562
bool CommonFont(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:667
void ForceFontMerges(unsigned start, unsigned end)
Definition: shapetable.cpp:485
bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:654
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:400
void AppendMasterShapes(const ShapeTable &other, std::vector< int > *shape_map)
Definition: shapetable.cpp:683