tesseract  5.0.0
unicharset_training_utils.cpp
Go to the documentation of this file.
1 // File: unicharset_training_utils.cpp
3 // Description: Training utilities for UNICHARSET.
4 // Author: Ray Smith
5 // Created: Fri Oct 17 17:09:01 PDT 2014
6 //
7 // (C) Copyright 2014, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
21 
22 #include <cstdlib>
23 #include <cstring>
24 #include <string>
25 #include <vector>
26 
27 #include <tesseract/unichar.h>
28 #include "fileio.h"
29 #include "icuerrorcode.h"
30 #include "normstrngs.h"
31 #include "statistc.h"
32 #include "unicharset.h"
33 #include "unicode/uchar.h" // from libicu
34 #include "unicode/uscript.h" // from libicu
35 
36 namespace tesseract {
37 
38 // Helper sets the character attribute properties and sets up the script table.
39 // Does not set tops and bottoms.
40 void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset) {
41  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
42  // Convert any custom ligatures.
43  const char *unichar_str = unicharset->id_to_unichar(unichar_id);
44  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
45  if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
46  unichar_str = UNICHARSET::kCustomLigatures[i][0];
47  break;
48  }
49  }
50 
51  // Convert the unichar to UTF32 representation
52  std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);
53 
54  // Assume that if the property is true for any character in the string,
55  // then it holds for the whole "character".
56  bool unichar_isalpha = false;
57  bool unichar_islower = false;
58  bool unichar_isupper = false;
59  bool unichar_isdigit = false;
60  bool unichar_ispunct = false;
61 
62  for (char32 u_ch : uni_vector) {
63  if (u_isalpha(u_ch)) {
64  unichar_isalpha = true;
65  }
66  if (u_islower(u_ch)) {
67  unichar_islower = true;
68  }
69  if (u_isupper(u_ch)) {
70  unichar_isupper = true;
71  }
72  if (u_isdigit(u_ch)) {
73  unichar_isdigit = true;
74  }
75  if (u_ispunct(u_ch)) {
76  unichar_ispunct = true;
77  }
78  }
79 
80  unicharset->set_isalpha(unichar_id, unichar_isalpha);
81  unicharset->set_islower(unichar_id, unichar_islower);
82  unicharset->set_isupper(unichar_id, unichar_isupper);
83  unicharset->set_isdigit(unichar_id, unichar_isdigit);
84  unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
85 
87  unicharset->set_script(unichar_id, uscript_getName(uscript_getScript(uni_vector[0], err)));
88 
89  const int num_code_points = uni_vector.size();
90  // Obtain the lower/upper case if needed and record it in the properties.
91  unicharset->set_other_case(unichar_id, unichar_id);
92  if (unichar_islower || unichar_isupper) {
93  std::vector<char32> other_case(num_code_points, 0);
94  for (int i = 0; i < num_code_points; ++i) {
95  // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
96  // However since they deal with UChars (so need a conversion function
97  // from char32 or UTF8string) and require a meaningful locale string,
98  // for now u_tolower()/u_toupper() are used.
99  other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]);
100  }
101  std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
102  UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str());
103  if (other_case_id != INVALID_UNICHAR_ID) {
104  unicharset->set_other_case(unichar_id, other_case_id);
105  } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
106  tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str);
107  }
108  }
109 
110  // Set RTL property and obtain mirror unichar ID from ICU.
111  std::vector<char32> mirrors(num_code_points, 0);
112  for (int i = 0; i < num_code_points; ++i) {
113  mirrors[i] = u_charMirror(uni_vector[i]);
114  if (i == 0) { // set directionality to that of the 1st code point
115  unicharset->set_direction(
116  unichar_id, static_cast<UNICHARSET::Direction>(u_charDirection(uni_vector[i])));
117  }
118  }
119  std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
120  UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
121  if (mirror_uch_id != INVALID_UNICHAR_ID) {
122  unicharset->set_mirror(unichar_id, mirror_uch_id);
123  } else if (report_errors) {
124  tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str);
125  }
126 
127  // Record normalized version of this unichar.
128  std::string normed_str;
129  if (unichar_id != 0 &&
133  &normed_str) &&
134  !normed_str.empty()) {
135  unicharset->set_normed(unichar_id, normed_str.c_str());
136  } else {
137  unicharset->set_normed(unichar_id, unichar_str);
138  }
139  ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
140  }
141  unicharset->post_load_setup();
142 }
143 
144 // Helper sets the properties from universal script unicharsets, if found.
145 void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset) {
146  for (int s = 0; s < unicharset->get_script_table_size(); ++s) {
147  // Load the unicharset for the script if available.
148  std::string filename =
149  script_dir + "/" + unicharset->get_script_from_script_id(s) + ".unicharset";
150  UNICHARSET script_set;
151  if (script_set.load_from_file(filename.c_str())) {
152  unicharset->SetPropertiesFromOther(script_set);
153  } else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {
154  tprintf("Failed to load script unicharset from:%s\n", filename.c_str());
155  }
156  }
157  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {
158  if (unicharset->PropertiesIncomplete(c)) {
159  tprintf("Warning: properties incomplete for index %d = %s\n", c,
160  unicharset->id_to_unichar(c));
161  }
162  }
163 }
164 
165 // Helper gets the combined x-heights string.
166 std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset) {
167  std::string xheights_str;
168  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
169  // Load the xheights for the script if available.
170  std::string filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".xheights";
171  std::string script_heights;
172  if (File::ReadFileToString(filename, &script_heights)) {
173  xheights_str += script_heights;
174  }
175  }
176  return xheights_str;
177 }
178 
179 // Helper to set the properties for an input unicharset file, writes to the
180 // output file. If an appropriate script unicharset can be found in the
181 // script_dir directory, then the tops and bottoms are expanded using the
182 // script unicharset.
183 // If non-empty, xheight data for the fonts are written to the xheights_file.
184 void SetPropertiesForInputFile(const std::string &script_dir,
185  const std::string &input_unicharset_file,
186  const std::string &output_unicharset_file,
187  const std::string &output_xheights_file) {
188  UNICHARSET unicharset;
189 
190  // Load the input unicharset
191  unicharset.load_from_file(input_unicharset_file.c_str());
192  tprintf("Loaded unicharset of size %zu from file %s\n", unicharset.size(),
193  input_unicharset_file.c_str());
194 
195  // Set unichar properties
196  tprintf("Setting unichar properties\n");
197  SetupBasicProperties(true, false, &unicharset);
198  tprintf("Setting script properties\n");
199  SetScriptProperties(script_dir, &unicharset);
200  if (!output_xheights_file.empty()) {
201  std::string xheights_str = GetXheightString(script_dir, unicharset);
202  File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
203  }
204 
205  // Write the output unicharset
206  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
207  unicharset.save_to_file(output_unicharset_file.c_str());
208 }
209 
210 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetPropertiesForInputFile(const std::string &script_dir, const std::string &input_unicharset_file, const std::string &output_unicharset_file, const std::string &output_xheights_file)
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
int UNICHAR_ID
Definition: unichar.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:237
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:483
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:887
int common_sid() const
Definition: unicharset.h:920
int get_script_table_size() const
Definition: unicharset.h:882
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:447
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:488
int null_sid() const
Definition: unicharset.h:917
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:478
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:457
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:437
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:473
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:442
size_t size() const
Definition: unicharset.h:355
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:563
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:452
static bool ReadFileToString(const std::string &filename, std::string *out)
Definition: fileio.cpp:73
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:54