tesseract  5.0.0
unicharset.h
Go to the documentation of this file.
1 // File: unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20 #define TESSERACT_CCUTIL_UNICHARSET_H_
21 
22 #include "errcode.h"
23 #include "unicharmap.h"
24 
25 #include <tesseract/unichar.h>
26 #include "helpers.h"
27 #include "serialis.h"
28 
29 #include <functional> // for std::function
30 
31 namespace tesseract {
32 
// Enum holding special values of unichar_id. Every unicharset has these.
// Warning! Keep in sync with kSpecialUnicharCodes.
// NOTE(review): the enum head and enumerators were missing from this listing
// and are restored here; confirm names and order against the upstream header.
enum SpecialUnicharCodes {
  UNICHAR_SPACE,
  UNICHAR_JOINED,
  UNICHAR_BROKEN,

  // Number of special codes; used to size kSpecialUnicharCodes.
  SPECIAL_UNICHAR_CODES_COUNT
};
42 
// Two-state flag accepted by unichar_insert. The name reads as a double
// negative on purpose, so that the default choice is the "false" value.
enum class OldUncleanUnichars {
  kFalse,
  kTrue,
};
49 
51 public:
52  // Minimum number of characters used for fragment representation.
53  static const int kMinLen = 6;
54  // Maximum number of characters used for fragment representation.
55  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
56  // Maximum number of fragments per character.
57  static const int kMaxChunks = 5;
58 
59  // Setters and Getters.
60  inline void set_all(const char *unichar, int pos, int total, bool natural) {
61  set_unichar(unichar);
62  set_pos(pos);
63  set_total(total);
64  set_natural(natural);
65  }
66  inline void set_unichar(const char *uch) {
67  strncpy(this->unichar, uch, sizeof(this->unichar));
68  this->unichar[UNICHAR_LEN] = '\0';
69  }
70  inline void set_pos(int p) {
71  this->pos = p;
72  }
73  inline void set_total(int t) {
74  this->total = t;
75  }
76  inline const char *get_unichar() const {
77  return this->unichar;
78  }
79  inline int get_pos() const {
80  return this->pos;
81  }
82  inline int get_total() const {
83  return this->total;
84  }
85 
86  // Returns the string that represents a fragment
87  // with the given unichar, pos and total.
88  static std::string to_string(const char *unichar, int pos, int total,
89  bool natural);
90  // Returns the string that represents this fragment.
91  std::string to_string() const {
92  return to_string(unichar, pos, total, natural);
93  }
94 
95  // Checks whether a fragment has the same unichar,
96  // position and total as the given inputs.
97  inline bool equals(const char *other_unichar, int other_pos,
98  int other_total) const {
99  return (strcmp(this->unichar, other_unichar) == 0 &&
100  this->pos == other_pos && this->total == other_total);
101  }
102  inline bool equals(const CHAR_FRAGMENT *other) const {
103  return this->equals(other->get_unichar(), other->get_pos(),
104  other->get_total());
105  }
106 
107  // Checks whether a given fragment is a continuation of this fragment.
108  // Assumes that the given fragment pointer is not nullptr.
109  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
110  return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
111  this->total == fragment->get_total() &&
112  this->pos == fragment->get_pos() + 1);
113  }
114 
115  // Returns true if this fragment is a beginning fragment.
116  inline bool is_beginning() const {
117  return this->pos == 0;
118  }
119 
120  // Returns true if this fragment is an ending fragment.
121  inline bool is_ending() const {
122  return this->pos == this->total - 1;
123  }
124 
125  // Returns true if the fragment was a separate component to begin with,
126  // ie did not need chopping to be isolated, but may have been separated
127  // out from a multi-outline blob.
128  inline bool is_natural() const {
129  return natural;
130  }
131  void set_natural(bool value) {
132  natural = value;
133  }
134 
135  // Parses the string to see whether it represents a character fragment
136  // (rather than a regular character). If so, allocates memory for a new
137  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
138  // information. Fragments are of the form:
139  // |m|1|2, meaning chunk 1 of 2 of character m, or
140  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
141  // to divide the parts, as they were already separate connected components.
142  //
143  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
144  // instance, otherwise (if the string does not represent a fragment or it
145  // looks like it does, but parsing it as a fragment fails) returns nullptr.
146  //
147  // Note: The caller is responsible for deallocating memory
148  // associated with the returned pointer.
149  static CHAR_FRAGMENT *parse_from_string(const char *str);
150 
151 private:
152  char unichar[UNICHAR_LEN + 1];
153  // True if the fragment was a separate component to begin with,
154  // ie did not need chopping to be isolated, but may have been separated
155  // out from a multi-outline blob.
156  bool natural;
157  int16_t pos; // fragment position in the character
158  int16_t total; // total number of fragments in the character
159 };
160 
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
165 public:
// Custom list of characters and their ligature forms (UTF8).
// These map to unicode values in the private use area (PUC) and are supported
// by only a few font families (eg. Wyld, Adobe Caslon Pro).
169  static const char *kCustomLigatures[][2];
170 
171  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
172  static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
173 
// Character direction classes. Values mirror the ICU 2.0 UCharDirection enum
// (icu/include/unicode/uchar.h), so the explicit numbers must not change.
enum Direction {
  U_LEFT_TO_RIGHT = 0,
  U_RIGHT_TO_LEFT = 1,
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  U_FIRST_STRONG_ISOLATE = 19,
  U_LEFT_TO_RIGHT_ISOLATE = 20,
  U_RIGHT_TO_LEFT_ISOLATE = 21,
  U_POP_DIRECTIONAL_ISOLATE = 22,
  // The count enumerator is only present when deprecated API is not hidden.
#ifndef U_HIDE_DEPRECATED_API
  U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
};
203 
204  // Create an empty UNICHARSET
205  UNICHARSET();
206 
207  ~UNICHARSET();
208 
209  // Return the UNICHAR_ID of a given unichar representation within the
210  // UNICHARSET.
211  UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;
212 
213  // Return the UNICHAR_ID of a given unichar representation within the
214  // UNICHARSET. Only the first length characters from unichar_repr are used.
215  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
216 
217  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
218  // while leaving the rest of the string encodable. Returns 0 if the
219  // beginning of the string is not encodable.
220  // WARNING: this function now encodes the whole string for precision.
221  // Use encode_string in preference to repeatedly calling step.
222  int step(const char *str) const;
223 
224  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
225  // If not encodable, write the first byte offset which cannot be converted
226  // into the second (return) argument.
227  bool encodable_string(const char *str, unsigned *first_bad_position) const;
228 
229  // Encodes the given UTF-8 string with this UNICHARSET.
230  // Any part of the string that cannot be encoded (because the utf8 can't
231  // be broken up into pieces that are in the unicharset) then:
232  // if give_up_on_failure, stops and returns a partial encoding,
233  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
234  // Returns true if the encoding succeeds completely, false if there is at
235  // least one failure.
236  // If lengths is not nullptr, then it is filled with the corresponding
237  // byte length of each encoded UNICHAR_ID.
238  // If encoded_length is not nullptr then on return it contains the length of
239  // str that was encoded. (if give_up_on_failure the location of the first
240  // failure, otherwise strlen(str).)
241  // WARNING: Caller must guarantee that str has already been cleaned of codes
242  // that do not belong in the unicharset, or encoding may fail.
243  // Use CleanupString to perform the cleaning.
244  bool encode_string(const char *str, bool give_up_on_failure,
245  std::vector<UNICHAR_ID> *encoding,
246  std::vector<char> *lengths,
247  unsigned *encoded_length) const;
248 
249  // Return the unichar representation corresponding to the given UNICHAR_ID
250  // within the UNICHARSET.
251  const char *id_to_unichar(UNICHAR_ID id) const;
252 
253  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
254  // resolving any private encodings internal to Tesseract. This method is
255  // preferable to id_to_unichar for outputting text that will be visible to
256  // external applications.
257  const char *id_to_unichar_ext(UNICHAR_ID id) const;
258 
259  // Return a string that reformats the utf8 str into the str followed
260  // by its hex unicodes.
261  static std::string debug_utf8_str(const char *str);
262 
263  // Removes/replaces content that belongs in rendered text, but not in the
264  // unicharset.
265  static std::string CleanupString(const char *utf8_str) {
266  return CleanupString(utf8_str, strlen(utf8_str));
267  }
268  static std::string CleanupString(const char *utf8_str, size_t length);
269 
270  // Return a string containing debug information on the unichar, including
271  // the id_to_unichar, its hex unicodes and the properties.
272  std::string debug_str(UNICHAR_ID id) const;
273  std::string debug_str(const char *unichar_repr) const {
274  return debug_str(unichar_to_id(unichar_repr));
275  }
276 
277  // Adds a unichar representation to the set. If old_style is true, then
278  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
279  // characters are ignored/skipped as if they don't exist and n-grams that
280  // can already be encoded are not added.
281  void unichar_insert(const char *const unichar_repr,
282  OldUncleanUnichars old_style);
283  void unichar_insert(const char *const unichar_repr) {
284  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285  }
286  // Adds a unichar representation to the set. Avoids setting old_style to true,
287  // unless it is necessary to make the new unichar get added.
288  void unichar_insert_backwards_compatible(const char *const unichar_repr) {
289  std::string cleaned = CleanupString(unichar_repr);
290  if (cleaned != unichar_repr) {
291  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292  } else {
293  auto old_size = size();
294  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295  if (size() == old_size) {
296  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297  }
298  }
299  }
300 
301  // Return true if the given unichar id exists within the set.
302  // Relies on the fact that unichar ids are contiguous in the unicharset.
303  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
304  return static_cast<size_t>(unichar_id) < unichars.size();
305  }
306 
307  // Return true if the given unichar representation exists within the set.
308  bool contains_unichar(const char *const unichar_repr) const;
309  bool contains_unichar(const char *const unichar_repr, int length) const;
310 
311  // Return true if the given unichar representation corresponds to the given
312  // UNICHAR_ID within the set.
313  bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;
314 
315  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
317  for (auto &unichar : unichars) {
318  delete unichar.properties.fragment;
319  unichar.properties.fragment = nullptr;
320  }
321  }
322 
323  // Clear the UNICHARSET (all the previous data is lost).
324  void clear() {
325  if (script_table != nullptr) {
326  for (int i = 0; i < script_table_size_used; ++i) {
327  delete[] script_table[i];
328  }
329  delete[] script_table;
330  script_table = nullptr;
331  script_table_size_used = 0;
332  }
333  script_table_size_reserved = 0;
334  delete_pointers_in_unichars();
335  unichars.clear();
336  ids.clear();
337  top_bottom_set_ = false;
338  script_has_upper_lower_ = false;
339  script_has_xheight_ = false;
340  old_style_included_ = false;
341  null_sid_ = 0;
342  common_sid_ = 0;
343  latin_sid_ = 0;
344  cyrillic_sid_ = 0;
345  greek_sid_ = 0;
346  han_sid_ = 0;
347  hiragana_sid_ = 0;
348  katakana_sid_ = 0;
349  thai_sid_ = 0;
350  hangul_sid_ = 0;
351  default_sid_ = 0;
352  }
353 
354  // Return the size of the set (the number of different UNICHAR it holds).
355  size_t size() const {
356  return unichars.size();
357  }
358 
359  // Opens the file indicated by filename and saves unicharset to that file.
360  // Returns true if the operation is successful.
361  bool save_to_file(const char *const filename) const {
362  FILE *file = fopen(filename, "w+b");
363  if (file == nullptr) {
364  return false;
365  }
366  bool result = save_to_file(file);
367  fclose(file);
368  return result;
369  }
370 
371  // Saves the content of the UNICHARSET to the given file.
372  // Returns true if the operation is successful.
373  bool save_to_file(FILE *file) const {
374  std::string str;
375  return save_to_string(str) &&
376  tesseract::Serialize(file, &str[0], str.length());
377  }
378 
380  std::string str;
381  return save_to_string(str) && file->Serialize(&str[0], str.length());
382  }
383 
384  // Saves the content of the UNICHARSET to the given string.
385  // Returns true if the operation is successful.
386  bool save_to_string(std::string &str) const;
387 
388  // Opens the file indicated by filename and loads the UNICHARSET
389  // from the given file. The previous data is lost.
390  // Returns true if the operation is successful.
391  bool load_from_file(const char *const filename, bool skip_fragments) {
392  FILE *file = fopen(filename, "rb");
393  if (file == nullptr) {
394  return false;
395  }
396  bool result = load_from_file(file, skip_fragments);
397  fclose(file);
398  return result;
399  }
400  // returns true if the operation is successful.
401  bool load_from_file(const char *const filename) {
402  return load_from_file(filename, false);
403  }
404 
405  // Loads the UNICHARSET from the given file. The previous data is lost.
406  // Returns true if the operation is successful.
407  bool load_from_file(FILE *file, bool skip_fragments);
408  bool load_from_file(FILE *file) {
409  return load_from_file(file, false);
410  }
411  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
412 
413  // Sets up internal data after loading the file, based on the char
414  // properties. Called from load_from_file, but also needs to be run
415  // during set_unicharset_properties.
416  void post_load_setup();
417 
418  // Returns true if right_to_left scripts are significant in the unicharset,
419  // but without being so sensitive that "universal" unicharsets containing
420  // characters from many scripts, like orientation and script detection,
421  // look like they are right_to_left.
422  bool major_right_to_left() const;
423 
424  // Set a whitelist and/or blacklist of characters to recognize.
425  // An empty or nullptr whitelist enables everything (minus any blacklist).
426  // An empty or nullptr blacklist disables nothing.
427  // An empty or nullptr unblacklist has no effect.
428  // The blacklist overrides the whitelist.
429  // The unblacklist overrides the blacklist.
430  // Each list is a string of utf8 character strings. Boundaries between
431  // unicharset units are worked out automatically, and characters not in
432  // the unicharset are silently ignored.
433  void set_black_and_whitelist(const char *blacklist, const char *whitelist,
434  const char *unblacklist);
435 
436  // Set the isalpha property of the given unichar to the given value.
437  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
438  unichars[unichar_id].properties.isalpha = value;
439  }
440 
441  // Set the islower property of the given unichar to the given value.
442  void set_islower(UNICHAR_ID unichar_id, bool value) {
443  unichars[unichar_id].properties.islower = value;
444  }
445 
446  // Set the isupper property of the given unichar to the given value.
447  void set_isupper(UNICHAR_ID unichar_id, bool value) {
448  unichars[unichar_id].properties.isupper = value;
449  }
450 
451  // Set the isdigit property of the given unichar to the given value.
452  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
453  unichars[unichar_id].properties.isdigit = value;
454  }
455 
456  // Set the ispunctuation property of the given unichar to the given value.
457  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
458  unichars[unichar_id].properties.ispunctuation = value;
459  }
460 
461  // Set the isngram property of the given unichar to the given value.
462  void set_isngram(UNICHAR_ID unichar_id, bool value) {
463  unichars[unichar_id].properties.isngram = value;
464  }
465 
466  // Set the script name of the given unichar to the given value.
467  // Value is copied and thus can be a temporary;
468  void set_script(UNICHAR_ID unichar_id, const char *value) {
469  unichars[unichar_id].properties.script_id = add_script(value);
470  }
471 
472  // Set other_case unichar id in the properties for the given unichar id.
473  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
474  unichars[unichar_id].properties.other_case = other_case;
475  }
476 
477  // Set the direction property of the given unichar to the given value.
479  unichars[unichar_id].properties.direction = value;
480  }
481 
482  // Set mirror unichar id in the properties for the given unichar id.
483  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
484  unichars[unichar_id].properties.mirror = mirror;
485  }
486 
487  // Record normalized version of unichar with the given unichar_id.
488  void set_normed(UNICHAR_ID unichar_id, const char *normed) {
489  unichars[unichar_id].properties.normed = normed;
490  unichars[unichar_id].properties.normed_ids.clear();
491  }
492  // Sets the normed_ids vector from the normed string. normed_ids is not
493  // stored in the file, and needs to be set when the UNICHARSET is loaded.
494  void set_normed_ids(UNICHAR_ID unichar_id);
495 
496  // Return the isalpha property of the given unichar.
497  bool get_isalpha(UNICHAR_ID unichar_id) const {
498  if (INVALID_UNICHAR_ID == unichar_id) {
499  return false;
500  }
501  ASSERT_HOST(contains_unichar_id(unichar_id));
502  return unichars[unichar_id].properties.isalpha;
503  }
504 
505  // Return the islower property of the given unichar.
506  bool get_islower(UNICHAR_ID unichar_id) const {
507  if (INVALID_UNICHAR_ID == unichar_id) {
508  return false;
509  }
510  ASSERT_HOST(contains_unichar_id(unichar_id));
511  return unichars[unichar_id].properties.islower;
512  }
513 
514  // Return the isupper property of the given unichar.
515  bool get_isupper(UNICHAR_ID unichar_id) const {
516  if (INVALID_UNICHAR_ID == unichar_id) {
517  return false;
518  }
519  ASSERT_HOST(contains_unichar_id(unichar_id));
520  return unichars[unichar_id].properties.isupper;
521  }
522 
523  // Return the isdigit property of the given unichar.
524  bool get_isdigit(UNICHAR_ID unichar_id) const {
525  if (INVALID_UNICHAR_ID == unichar_id) {
526  return false;
527  }
528  ASSERT_HOST(contains_unichar_id(unichar_id));
529  return unichars[unichar_id].properties.isdigit;
530  }
531 
532  // Return the ispunctuation property of the given unichar.
533  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
534  if (INVALID_UNICHAR_ID == unichar_id) {
535  return false;
536  }
537  ASSERT_HOST(contains_unichar_id(unichar_id));
538  return unichars[unichar_id].properties.ispunctuation;
539  }
540 
541  // Return the isngram property of the given unichar.
542  bool get_isngram(UNICHAR_ID unichar_id) const {
543  if (INVALID_UNICHAR_ID == unichar_id) {
544  return false;
545  }
546  ASSERT_HOST(contains_unichar_id(unichar_id));
547  return unichars[unichar_id].properties.isngram;
548  }
549 
550  // Returns whether the unichar id represents a unicode value in the private
551  // use area.
552  bool get_isprivate(UNICHAR_ID unichar_id) const;
553 
554  // Returns true if the ids have useful min/max top/bottom values.
555  bool top_bottom_useful() const {
556  return top_bottom_set_;
557  }
558  // Sets all ranges to empty, so they can be expanded to set the values.
559  void set_ranges_empty();
560  // Sets all the properties for this unicharset given a src_unicharset with
561  // everything set. The unicharsets don't have to be the same, and graphemes
562  // are correctly accounted for.
564  PartialSetPropertiesFromOther(0, src);
565  }
566  // Sets properties from Other, starting only at the given index.
567  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
568  // Expands the tops and bottoms and widths for this unicharset given a
569  // src_unicharset with ranges in it. The unicharsets don't have to be the
570  // same, and graphemes are correctly accounted for.
571  void ExpandRangesFromOther(const UNICHARSET &src);
// Makes this a copy of src. Clears this completely first, so the automatic
// ids will not be present in this if not in src.
574  void CopyFrom(const UNICHARSET &src);
575  // For each id in src, if it does not occur in this, add it, as in
576  // SetPropertiesFromOther, otherwise expand the ranges, as in
577  // ExpandRangesFromOther.
578  void AppendOtherUnicharset(const UNICHARSET &src);
579  // Returns true if the acceptable ranges of the tops of the characters do
580  // not overlap, making their x-height calculations distinct.
581  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
582  // Returns the min and max bottom and top of the given unichar in
583  // baseline-normalized coordinates, ie, where the baseline is
584  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
585  // (See normalis.h for the definitions).
586  void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
587  int *min_top, int *max_top) const {
588  if (INVALID_UNICHAR_ID == unichar_id) {
589  *min_bottom = *min_top = 0;
590  *max_bottom = *max_top = 256; // kBlnCellHeight
591  return;
592  }
593  ASSERT_HOST(contains_unichar_id(unichar_id));
594  *min_bottom = unichars[unichar_id].properties.min_bottom;
595  *max_bottom = unichars[unichar_id].properties.max_bottom;
596  *min_top = unichars[unichar_id].properties.min_top;
597  *max_top = unichars[unichar_id].properties.max_top;
598  }
599  void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
600  int min_top, int max_top) {
601  unichars[unichar_id].properties.min_bottom =
602  ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603  unichars[unichar_id].properties.max_bottom =
604  ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605  unichars[unichar_id].properties.min_top =
606  ClipToRange<int>(min_top, 0, UINT8_MAX);
607  unichars[unichar_id].properties.max_top =
608  ClipToRange<int>(max_top, 0, UINT8_MAX);
609  }
610  // Returns the width stats (as mean, sd) of the given unichar relative to the
611  // median advance of all characters in the character set.
612  void get_width_stats(UNICHAR_ID unichar_id, float *width,
613  float *width_sd) const {
614  if (INVALID_UNICHAR_ID == unichar_id) {
615  *width = 0.0f;
616  *width_sd = 0.0f;
617  ;
618  return;
619  }
620  ASSERT_HOST(contains_unichar_id(unichar_id));
621  *width = unichars[unichar_id].properties.width;
622  *width_sd = unichars[unichar_id].properties.width_sd;
623  }
624  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
625  unichars[unichar_id].properties.width = width;
626  unichars[unichar_id].properties.width_sd = width_sd;
627  }
628  // Returns the stats of the x-bearing (as mean, sd) of the given unichar
629  // relative to the median advance of all characters in the character set.
630  void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
631  float *bearing_sd) const {
632  if (INVALID_UNICHAR_ID == unichar_id) {
633  *bearing = *bearing_sd = 0.0f;
634  return;
635  }
636  ASSERT_HOST(contains_unichar_id(unichar_id));
637  *bearing = unichars[unichar_id].properties.bearing;
638  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
639  }
640  void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
641  float bearing_sd) {
642  unichars[unichar_id].properties.bearing = bearing;
643  unichars[unichar_id].properties.bearing_sd = bearing_sd;
644  }
645  // Returns the stats of the x-advance of the given unichar (as mean, sd)
646  // relative to the median advance of all characters in the character set.
647  void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
648  float *advance_sd) const {
649  if (INVALID_UNICHAR_ID == unichar_id) {
650  *advance = *advance_sd = 0;
651  return;
652  }
653  ASSERT_HOST(contains_unichar_id(unichar_id));
654  *advance = unichars[unichar_id].properties.advance;
655  *advance_sd = unichars[unichar_id].properties.advance_sd;
656  }
657  void set_advance_stats(UNICHAR_ID unichar_id, float advance,
658  float advance_sd) {
659  unichars[unichar_id].properties.advance = advance;
660  unichars[unichar_id].properties.advance_sd = advance_sd;
661  }
662  // Returns true if the font metrics properties are empty.
663  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
664  return unichars[unichar_id].properties.AnyRangeEmpty();
665  }
666 
667  // Returns true if the script of the given id is space delimited.
668  // Returns false for Han and Thai scripts.
669  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
670  if (INVALID_UNICHAR_ID == unichar_id) {
671  return true;
672  }
673  int script_id = get_script(unichar_id);
674  return script_id != han_sid_ && script_id != thai_sid_ &&
675  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
676  script_id != katakana_sid_;
677  }
678 
679  // Return the script name of the given unichar.
680  // The returned pointer will always be the same for the same script, it's
681  // managed by unicharset and thus MUST NOT be deleted
682  int get_script(UNICHAR_ID unichar_id) const {
683  if (INVALID_UNICHAR_ID == unichar_id) {
684  return null_sid_;
685  }
686  ASSERT_HOST(contains_unichar_id(unichar_id));
687  return unichars[unichar_id].properties.script_id;
688  }
689 
690  // Return the character properties, eg. alpha/upper/lower/digit/punct,
691  // as a bit field of unsigned int.
692  unsigned int get_properties(UNICHAR_ID unichar_id) const;
693 
694  // Return the character property as a single char. If a character has
695  // multiple attributes, the main property is defined by the following order:
696  // upper_case : 'A'
697  // lower_case : 'a'
698  // alpha : 'x'
699  // digit : '0'
700  // punctuation: 'p'
701  char get_chartype(UNICHAR_ID unichar_id) const;
702 
703  // Get other_case unichar id in the properties for the given unichar id.
705  if (INVALID_UNICHAR_ID == unichar_id) {
706  return INVALID_UNICHAR_ID;
707  }
708  ASSERT_HOST(contains_unichar_id(unichar_id));
709  return unichars[unichar_id].properties.other_case;
710  }
711 
712  // Returns the direction property of the given unichar.
713  Direction get_direction(UNICHAR_ID unichar_id) const {
714  if (INVALID_UNICHAR_ID == unichar_id) {
716  }
717  ASSERT_HOST(contains_unichar_id(unichar_id));
718  return unichars[unichar_id].properties.direction;
719  }
720 
721  // Get mirror unichar id in the properties for the given unichar id.
722  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
723  if (INVALID_UNICHAR_ID == unichar_id) {
724  return INVALID_UNICHAR_ID;
725  }
726  ASSERT_HOST(contains_unichar_id(unichar_id));
727  return unichars[unichar_id].properties.mirror;
728  }
729 
730  // Returns UNICHAR_ID of the corresponding lower-case unichar.
731  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
732  if (INVALID_UNICHAR_ID == unichar_id) {
733  return INVALID_UNICHAR_ID;
734  }
735  ASSERT_HOST(contains_unichar_id(unichar_id));
736  if (unichars[unichar_id].properties.islower) {
737  return unichar_id;
738  }
739  return unichars[unichar_id].properties.other_case;
740  }
741 
742  // Returns UNICHAR_ID of the corresponding upper-case unichar.
743  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
744  if (INVALID_UNICHAR_ID == unichar_id) {
745  return INVALID_UNICHAR_ID;
746  }
747  ASSERT_HOST(contains_unichar_id(unichar_id));
748  if (unichars[unichar_id].properties.isupper) {
749  return unichar_id;
750  }
751  return unichars[unichar_id].properties.other_case;
752  }
753 
754  // Returns true if this UNICHARSET has the special codes in
755  // SpecialUnicharCodes available. If false then there are normal unichars
756  // at these codes and they should not be used.
757  bool has_special_codes() const {
758  return get_fragment(UNICHAR_BROKEN) != nullptr &&
759  strcmp(id_to_unichar(UNICHAR_BROKEN),
760  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
761  }
762 
763  // Returns true if there are any repeated unicodes in the normalized
764  // text of any unichar-id in the unicharset.
765  bool AnyRepeatedUnicodes() const;
766 
767  // Return a pointer to the CHAR_FRAGMENT class if the given
768  // unichar id represents a character fragment.
769  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
770  if (INVALID_UNICHAR_ID == unichar_id) {
771  return nullptr;
772  }
773  ASSERT_HOST(contains_unichar_id(unichar_id));
774  return unichars[unichar_id].properties.fragment;
775  }
776 
777  // Return the isalpha property of the given unichar representation.
778  bool get_isalpha(const char *const unichar_repr) const {
779  return get_isalpha(unichar_to_id(unichar_repr));
780  }
781 
782  // Return the islower property of the given unichar representation.
783  bool get_islower(const char *const unichar_repr) const {
784  return get_islower(unichar_to_id(unichar_repr));
785  }
786 
787  // Return the isupper property of the given unichar representation.
788  bool get_isupper(const char *const unichar_repr) const {
789  return get_isupper(unichar_to_id(unichar_repr));
790  }
791 
792  // Return the isdigit property of the given unichar representation.
793  bool get_isdigit(const char *const unichar_repr) const {
794  return get_isdigit(unichar_to_id(unichar_repr));
795  }
796 
797  // Return the ispunctuation property of the given unichar representation.
798  bool get_ispunctuation(const char *const unichar_repr) const {
799  return get_ispunctuation(unichar_to_id(unichar_repr));
800  }
801 
802  // Return the character properties, eg. alpha/upper/lower/digit/punct,
803  // of the given unichar representation
804  unsigned int get_properties(const char *const unichar_repr) const {
805  return get_properties(unichar_to_id(unichar_repr));
806  }
807 
808  char get_chartype(const char *const unichar_repr) const {
809  return get_chartype(unichar_to_id(unichar_repr));
810  }
811 
812  // Return the script id of the given unichar representation.
813  // The result is an int id, not a pointer; the corresponding script string
814  // can be obtained with get_script_from_script_id().
815  int get_script(const char *const unichar_repr) const {
816  return get_script(unichar_to_id(unichar_repr));
817  }
818 
819  // Return a pointer to the CHAR_FRAGMENT if the given unichar representation
820  // represents a character fragment; nullptr if it is null, empty or unknown.
821  const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
822  if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
823  !ids.contains(unichar_repr, false)) {
824  return nullptr;
825  }
826  return get_fragment(unichar_to_id(unichar_repr));
827  }
828 
829  // Return the isalpha property of the given unichar representation.
830  // Only the first length characters from unichar_repr are used.
831  bool get_isalpha(const char *const unichar_repr, int length) const {
832  return get_isalpha(unichar_to_id(unichar_repr, length));
833  }
834 
835  // Return the islower property of the given unichar representation.
836  // Only the first length characters from unichar_repr are used.
837  bool get_islower(const char *const unichar_repr, int length) const {
838  return get_islower(unichar_to_id(unichar_repr, length));
839  }
840 
841  // Return the isupper property of the given unichar representation.
842  // Only the first length characters from unichar_repr are used.
843  bool get_isupper(const char *const unichar_repr, int length) const {
844  return get_isupper(unichar_to_id(unichar_repr, length));
845  }
846 
847  // Return the isdigit property of the given unichar representation.
848  // Only the first length characters from unichar_repr are used.
849  bool get_isdigit(const char *const unichar_repr, int length) const {
850  return get_isdigit(unichar_to_id(unichar_repr, length));
851  }
852 
853  // Return the ispunctuation property of the given unichar representation.
854  // Only the first length characters from unichar_repr are used.
855  bool get_ispunctuation(const char *const unichar_repr, int length) const {
856  return get_ispunctuation(unichar_to_id(unichar_repr, length));
857  }
858 
859  // Returns normalized version of unichar with the given unichar_id.
860  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
861  if (unichar_id == UNICHAR_SPACE) {
862  return " ";
863  }
864  return unichars[unichar_id].properties.normed.c_str();
865  }
866  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
867  // version of the given id. There may be more than one UNICHAR_ID in the
868  // vector if unichar_id represents a ligature.
869  const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
870  return unichars[unichar_id].properties.normed_ids;
871  }
872 
873  // Return the script id of the given unichar representation.
874  // Only the first length characters from unichar_repr are used.
875  // The result is an int id, not a pointer; the corresponding script string
876  // can be obtained with get_script_from_script_id().
877  int get_script(const char *const unichar_repr, int length) const {
878  return get_script(unichar_to_id(unichar_repr, length));
879  }
880 
881  // Return the (current) number of scripts in the script table
882  int get_script_table_size() const {
883  return script_table_size_used;
884  }
885 
886  // Return the script string from its id
887  const char *get_script_from_script_id(int id) const {
888  if (id >= script_table_size_used || id < 0) {
889  return null_script;
890  }
891  return script_table[id];
892  }
893 
894  // Returns the id from the name of the script, or 0 if script is not found.
895  // Note that this is an expensive operation since it involves iteratively
896  // comparing strings in the script table. To avoid dependency on STL, we
897  // won't use a hash. Instead, callers should use this once to look up and
898  // save the ids of relevant scripts for fast comparisons later.
899  int get_script_id_from_name(const char *script_name) const;
900 
901  // Return true if the given script is the null script
902  bool is_null_script(const char *script) const {
903  return script == null_script;
904  }
905 
906  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
907  // then the returned script id (an int) will be the same.
908  // The script parameter is copied and thus can be a temporary.
909  int add_script(const char *script);
910 
911  // Return the enabled property of the given unichar.
912  bool get_enabled(UNICHAR_ID unichar_id) const {
913  ASSERT_HOST(contains_unichar_id(unichar_id));
914  return unichars[unichar_id].properties.enabled;
915  }
916 
917  int null_sid() const {
918  return null_sid_;
919  }
920  int common_sid() const {
921  return common_sid_;
922  }
923  int latin_sid() const {
924  return latin_sid_;
925  }
926  int cyrillic_sid() const {
927  return cyrillic_sid_;
928  }
929  int greek_sid() const {
930  return greek_sid_;
931  }
932  int han_sid() const {
933  return han_sid_;
934  }
935  int hiragana_sid() const {
936  return hiragana_sid_;
937  }
938  int katakana_sid() const {
939  return katakana_sid_;
940  }
941  int thai_sid() const {
942  return thai_sid_;
943  }
944  int hangul_sid() const {
945  return hangul_sid_;
946  }
947  int default_sid() const {
948  return default_sid_;
949  }
950 
951  // Returns true if the unicharset has the concept of upper/lower case.
952  bool script_has_upper_lower() const {
953  return script_has_upper_lower_;
954  }
955  // Returns true if the unicharset has the concept of x-height.
956  // script_has_xheight can be true even if script_has_upper_lower is not,
957  // when the script has a sufficiently predominant top line with ascenders,
958  // such as Devanagari and Thai.
959  bool script_has_xheight() const {
960  return script_has_xheight_;
961  }
962 
963 private:
964  struct TESS_API UNICHAR_PROPERTIES {
     // Property record for a single unichar; stored as
     // UNICHAR_SLOT::properties. The member functions are only declared here.
965  UNICHAR_PROPERTIES();
966  // Initializes all properties to sensible default values.
967  void Init();
968  // Sets all ranges wide open. This is the initialization default for the
969  // case where no useful values are available.
970  void SetRangesOpen();
971  // Sets all ranges to empty. Used before expanding with font-based data.
972  void SetRangesEmpty();
973  // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
974  // is empty.
975  bool AnyRangeEmpty() const;
976  // Expands the ranges with the ranges from the src properties.
977  void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
978  // Copies the properties from src into this.
979  void CopyFrom(const UNICHAR_PROPERTIES &src);
980
     // Boolean flags describing this unichar.
981  bool isalpha;
982  bool islower;
983  bool isupper;
984  bool isdigit;
985  bool ispunctuation;
986  bool isngram;
987  bool enabled;
988  // Possible limits of the top and bottom of the bounding box in
989  // baseline-normalized coordinates, ie, where the baseline is
990  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
991  // (See normalis.h for the definitions).
992  uint8_t min_bottom;
993  uint8_t max_bottom;
994  uint8_t min_top;
995  uint8_t max_top;
996  // Statistics of the widths of bounding box, relative to the median advance.
997  float width;
998  float width_sd;
999  // Stats of the x-bearing and advance, also relative to the median advance.
1000  float bearing;
1001  float bearing_sd;
1002  float advance;
1003  float advance_sd;
      // Integer id of this unichar's script.
1004  int script_id;
1005  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
1006  Direction direction; // direction of this unichar
1007  // Mirror property is useful for reverse DAWG lookup for words in
1008  // right-to-left languages (e.g. "(word)" would be in
1009  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
1010  // However, what we want in our DAWG is
1011  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
1012  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
1013  UNICHAR_ID mirror;
1014  // A string of unichar_ids that represent the corresponding normed string.
1015  // For awkward characters like em-dash, this gives hyphen.
1016  // For ligatures, this gives the string of normal unichars.
1017  std::vector<UNICHAR_ID> normed_ids;
1018  std::string normed; // normalized version of this unichar
1019  // Contains meta information about the fragment if a unichar represents
1020  // a fragment of a character, otherwise should be set to nullptr.
1021  // It is assumed that character fragments are added to the unicharset
1022  // after the corresponding 'base' characters.
      // NOTE(review): raw pointer; ownership is not visible here - presumably
      // released by delete_pointers_in_unichars(), confirm.
1023  CHAR_FRAGMENT *fragment;
1024  };
1025 
1026  struct UNICHAR_SLOT {
      // Element type of the unichars vector: a unichar's text plus its
      // properties.
      // Buffer of UNICHAR_LEN + 1 chars (the extra char is presumably for the
      // terminating NUL - confirm where the buffer is filled).
1027  char representation[UNICHAR_LEN + 1];
      // Properties associated with this unichar.
1028  UNICHAR_PROPERTIES properties;
1029  };
1030 
1031  // Internal recursive version of encode_string above.
1032  // str is the start of the whole string.
1033  // str_index is the current position in str.
1034  // str_length is the length of str.
1035  // encoding is a working encoding of str.
1036  // lengths is a working set of lengths of each element of encoding.
1037  // best_total_length is the longest length of str that has been successfully
1038  // encoded so far (passed by pointer).
1039  // On return:
1040  // best_encoding contains the encoding that used the longest part of str.
1041  // best_lengths (may be null) contains the lengths of best_encoding.
1042  void encode_string(const char *str, int str_index, int str_length,
1043  std::vector<UNICHAR_ID> *encoding,
1044  std::vector<char> *lengths, unsigned *best_total_length,
1045  std::vector<UNICHAR_ID> *best_encoding,
1046  std::vector<char> *best_lengths) const;
1047 
1048  // Gets the properties for a grapheme string, combining properties for
1049  // multiple characters in a meaningful way where possible.
1050  // Returns false if no valid match was found in the unicharset.
1051  // NOTE that script_id, mirror, and other_case refer to this unicharset on
1052  // return and will need redirecting if the target unicharset is different.
      // utf8_str is the input string; props is the output record (whether it
      // is modified on failure is not visible here - confirm).
1053  bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
1054 
1055  // Load ourselves from a "file" where our only interface to the file is
1056  // an implementation of fgets(). This is the parsing primitive accessed by
1057  // the public routines load_from_file().
      // fgets_cb is a callable with signature char *(char *, int).
      // skip_fragments: presumably omits fragment entries while loading -
      // confirm in the definition.
1058  bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
1059  bool skip_fragments);
1060 
1061  // List of mappings to make when ingesting strings from the outside.
1062  // The substitutions clean up text that should exist for rendering of
1063  // synthetic data, but not in the recognition set.
1064  static const char *kCleanupMaps[][2];
      // Script string returned by get_script_from_script_id() for out-of-range
      // ids; is_null_script() compares against it by pointer.
1065  static const char *null_script;
1066 
      // Table of unichar slots, indexed by UNICHAR_ID.
1067  std::vector<UNICHAR_SLOT> unichars;
      // Lookup structure for unichar representations (queried with
      // ids.contains()).
1068  UNICHARMAP ids;
      // Table of script strings indexed by script id; entries in
      // [0, script_table_size_used) are returned by
      // get_script_from_script_id().
1069  char **script_table;
1070  int script_table_size_used;
      // NOTE(review): presumably the allocated capacity of script_table -
      // confirm in add_script().
1071  int script_table_size_reserved;
1072  // True if the unichars have their tops/bottoms set.
1073  bool top_bottom_set_;
1074  // True if the unicharset has significant upper/lower case chars.
1075  bool script_has_upper_lower_;
1076  // True if the unicharset has a significant mean-line with significant
1077  // ascenders above that.
1078  bool script_has_xheight_;
1079  // True if the set contains chars that would be changed by the cleanup.
1080  bool old_style_included_;
1081
1082  // A few convenient script name-to-id mappings without using a hash.
1083  // These are initialized when unicharset file is loaded. Anything
1084  // missing from this list can be looked up using get_script_id_from_name.
1085  int null_sid_;
1086  int common_sid_;
1087  int latin_sid_;
1088  int cyrillic_sid_;
1089  int greek_sid_;
1090  int han_sid_;
1091  int hiragana_sid_;
1092  int katakana_sid_;
1093  int thai_sid_;
1094  int hangul_sid_;
1095  // The most frequently occurring script in the charset.
1096  int default_sid_;
1097 };
1098 
1099 } // namespace tesseract
1100 
1101 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
#define UNICHAR_LEN
Definition: unichar.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:59
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
OldUncleanUnichars
Definition: unicharset.h:45
int UNICHAR_ID
Definition: unichar.h:36
SpecialUnicharCodes
Definition: unicharset.h:35
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
@ UNICHAR_JOINED
Definition: unicharset.h:37
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:97
void set_unichar(const char *uch)
Definition: unicharset.h:66
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:60
bool is_ending() const
Definition: unicharset.h:121
void set_natural(bool value)
Definition: unicharset.h:131
const char * get_unichar() const
Definition: unicharset.h:76
std::string to_string() const
Definition: unicharset.h:91
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:109
bool is_natural() const
Definition: unicharset.h:128
bool is_beginning() const
Definition: unicharset.h:116
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:102
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:783
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:821
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:778
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:283
std::string debug_str(const char *unichar_repr) const
Definition: unicharset.h:273
int greek_sid() const
Definition: unicharset.h:929
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:483
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:860
void delete_pointers_in_unichars()
Definition: unicharset.h:316
int default_sid() const
Definition: unicharset.h:947
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
bool script_has_xheight() const
Definition: unicharset.h:959
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:877
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:869
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:887
int common_sid() const
Definition: unicharset.h:920
int han_sid() const
Definition: unicharset.h:932
int get_script_table_size() const
Definition: unicharset.h:882
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:843
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool has_special_codes() const
Definition: unicharset.h:757
int cyrillic_sid() const
Definition: unicharset.h:926
int hiragana_sid() const
Definition: unicharset.h:935
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:447
bool script_has_upper_lower() const
Definition: unicharset.h:952
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:488
bool is_null_script(const char *script) const
Definition: unicharset.h:902
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:815
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:798
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:647
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:793
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:624
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:849
int null_sid() const
Definition: unicharset.h:917
int hangul_sid() const
Definition: unicharset.h:944
bool load_from_file(FILE *file)
Definition: unicharset.h:408
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:599
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:478
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:457
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:855
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:804
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:788
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
int latin_sid() const
Definition: unicharset.h:923
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:743
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:630
int katakana_sid() const
Definition: unicharset.h:938
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:437
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
bool top_bottom_useful() const
Definition: unicharset.h:555
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:831
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288
bool save_to_file(FILE *file) const
Definition: unicharset.h:373
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:473
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool load_from_file(const char *const filename)
Definition: unicharset.h:401
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:837
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:912
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:442
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
size_t size() const
Definition: unicharset.h:355
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:669
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:808
int thai_sid() const
Definition: unicharset.h:941
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:563
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:657
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:452
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:379
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:640
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:731
#define TESS_API
Definition: export.h:34