tesseract  5.0.0
validate_indic.cpp
Go to the documentation of this file.
1 #include "validate_indic.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4 
5 namespace tesseract {
6 
7 // Returns whether codes matches the pattern for an Indic Grapheme.
8 // The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
9 // has a BNF for valid syllables (Graphemes) which is modified slightly
10 // for Unicode. Notably U+200C (ZWNJ) and U+200D (ZWJ) are used before/after
11 // the virama to express explicit or soft viramas.
12 // Also the unicode v.9 Malayalam entry states that CZHC can be used in several
13 // Indic languages to request traditional ligatures, and CzHC is Malayalam-
14 // specific for requesting open conjuncts.
15 //
16 // + vowel Grapheme: V[D](v)*
17 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
// Dispatch on the class of the code at codes_used_; each case either hands off
// to the matching consumer or rejects the sequence.
19  switch (codes_[codes_used_].first) {
// NOTE(review): the case label guarding the next return is not present in this
// listing (numbering skips 20); presumably the consonant class -- confirm.
// Succeeds only when both the consonant head and the consonant tail are valid.
21  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
22  case CharClass::kVowel:
// NOTE(review): the listing numbering skips 23, so another case label may
// share the vowel path -- confirm.
24  return ConsumeVowelIfValid();
// NOTE(review): the case label(s) for the joiner handling below are not
// present in this listing (numbering skips 25-26) -- confirm.
27  // Apart from within an aksara, joiners are silently dropped.
28  if (report_errors_) {
29  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
30  }
// Advance past the joiner; nothing is appended to output_ on this path, and
// the grapheme is reported as valid.
31  ++codes_used_;
32  return true;
33  case CharClass::kOther:
// Consume exactly one code. The result of UseMultiCode is not inspected here;
// true is returned regardless.
34  UseMultiCode(1);
35  return true;
36  default:
// Any remaining class cannot begin a grapheme: log (if enabled) and reject.
// The class value is printed via %c and the code point in hex.
37  if (report_errors_) {
38  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
39  static_cast<int>(codes_[codes_used_].first),
40  codes_[codes_used_].second);
41  }
42  return false;
43  }
44 }
45 
// Maps a single code point `ch` to a CharClass, using its offset from the base
// of the code block selected by script_.
// NOTE(review): the function signature (listing line 46) is not present in
// this listing -- confirm against the header.
// Checks run in order; the first matching rule wins.
47  if (IsVedicAccent(ch)) {
48  return CharClass::kVedicMark;
49  }
// The two joiners are tested by exact value before any offset arithmetic.
// NOTE(review): the return statements inside the next two ifs are not present
// in this listing (numbering skips 51 and 54) -- confirm.
50  if (ch == kZeroWidthNonJoiner) {
52  }
53  if (ch == kZeroWidthJoiner) {
55  }
56  // Offset from the start of the relevant unicode code block aka code page.
// The numeric value of script_ is used directly as the block base.
57  int base = static_cast<char32>(script_);
58  int off = ch - base;
59  // Anything in another code block is other.
60  if (off < 0 || off >= kIndicCodePageSize) {
61  return CharClass::kOther;
62  }
63  // Exception for Tamil. The aytham character is considered a letter.
// This must precede the generic `off < 0x4` test below, which would otherwise
// claim offset 0x03.
64  if (script_ == ViramaScript::kTamil && off == 0x03) {
65  return CharClass::kVowel;
66  }
// NOTE(review): the return for offsets 0x00-0x03 is not present in this
// listing (numbering skips 68) -- confirm.
67  if (off < 0x4) {
69  }
// NOTE(review): the opening of the Sinhala-specific branch (listing line 70)
// is not present; the `} else {` further down pairs with it -- confirm.
71  // Sinhala is an exception.
// Sinhala layout: the thresholds are cumulative because earlier ranges have
// already returned.
72  if (off <= 0x19) {
73  return CharClass::kVowel;
74  }
75  if (off <= 0x49) {
76  return CharClass::kConsonant;
77  }
78  if (off == 0x4a) {
79  return CharClass::kVirama;
80  }
81  if (off <= 0x5f) {
82  return CharClass::kMatra;
83  }
84  } else {
// Layout used for every other script handled here.
85  if (off <= 0x14 || off == 0x50) {
86  return CharClass::kVowel;
87  }
88  if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) {
89  return CharClass::kConsonant;
90  }
91  // Sinhala doesn't have Nukta or Avagraha.
92  if (off == 0x3c) {
93  return CharClass::kNukta;
94  }
95  if (off == 0x3d) {
96  return CharClass::kVowel; // avagraha
97  }
// Offset 0x4d (virama) also satisfies neither range here (0x4c < 0x4d < 0x51),
// so it falls through to the explicit virama test below.
98  if (off <= 0x4c || (0x51 <= off && off <= 0x54)) {
99  return CharClass::kMatra;
100  }
101  if (0x55 <= off && off <= 0x57) {
102  return CharClass::kMatraPiece;
103  }
104  if (off == 0x4d) {
105  return CharClass::kVirama;
106  }
107  }
// Rules from here on apply to both branches above.
108  if (off == 0x60 || off == 0x61) {
109  return CharClass::kVowel;
110  }
111  if (off == 0x62 || off == 0x63) {
112  return CharClass::kMatra;
113  }
114  // Danda and digits up to 6f are OK as other.
115  // 70-7f are script-specific.
116  // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
117  if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72)) {
118  return CharClass::kOther;
119  }
120  // 0BF3-0BFA are other Tamil symbols.
121  if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A)) {
122  return CharClass::kOther;
123  }
124  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) {
125  return CharClass::kConsonant;
126  }
127  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) {
128  return CharClass::kConsonant;
129  }
130  if (script_ == ViramaScript::kSinhala && off == 0x70) {
131  return CharClass::kConsonant;
132  }
133  if (script_ == ViramaScript::kDevanagari && off == 0x70) {
134  return CharClass::kOther;
135  }
// Reached only for 0x70-0x73 offsets not claimed by a script-specific rule
// above.
// NOTE(review): the return for this range is not present in this listing
// (numbering skips 137) -- confirm.
136  if (0x70 <= off && off <= 0x73) {
138  }
139  // Non Indic, Digits, Measures, danda, etc.
140  return CharClass::kOther;
141 }
142 
143 // Helper consumes/copies a virama and any associated post-virama joiners.
144 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
145 // no joiner at all) must be followed by a consonant.
146 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
147 // consonant, space, or character from a different script. We clean up the
148 // representation to make it consistent by adding a ZWNJ if missing from a
149 // non-linking virama. Returns false with an invalid sequence.
// Parameters:
//   joiner     - (class, code) pair for a joiner seen before the virama; a
//                class of CharClass::kOther means there was no such joiner.
//   post_matra - true when the virama follows the matra/modifier tail; in that
//                mode a following ZWJ is rejected.
// Returns false only on the error paths below; every other path returns true.
150 bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
151  const unsigned num_codes = codes_.size();
// Branch 1: no pre-virama joiner.
152  if (joiner.first == CharClass::kOther) {
// NOTE(review): listing numbering skips 153, so a statement is missing here;
// the code below reads codes_[codes_used_] as the code AFTER the virama, so
// the virama itself is presumably consumed on the missing line -- confirm.
154  if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {
155  // Post-matra viramas must be explicit, so no joiners allowed here.
156  if (post_matra) {
157  if (report_errors_) {
158  tprintf("ZWJ after a post-matra virama!!\n");
159  }
160  return false;
161  }
// Look one code past the ZWJ and two codes back. codes_used_ is unsigned, so
// the `- 2` assumes at least two codes were already consumed -- TODO confirm.
162  if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kRayana &&
163  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
164  codes_[codes_used_ + 1].second == kYayana ||
165  codes_[codes_used_ + 1].second == kRayana)) {
166  // This combination will be picked up later.
// NOTE(review): listing numbering skips 167; the statement for this branch is
// not present -- confirm.
168  } else {
169  // Half-form with optional Nukta.
// Length covers everything in output_ not yet emitted, plus one more code.
170  unsigned len = output_.size() + 1 - output_used_;
171  if (UseMultiCode(len)) {
172  return true;
173  }
174  }
// A ZWNJ at this point is accepted only when the first pending output code is
// kRayana; otherwise the sequence is rejected.
175  if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {
176  if (output_used_ == output_.size() || output_[output_used_] != kRayana) {
177  if (report_errors_) {
178  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_));
179  }
180  return false;
181  }
182  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
183  if (UseMultiCode(4)) {
184  return true;
185  }
186  }
// No ZWJ follows: treat the virama as explicit when input has ended, the next
// code is not a consonant, or this is a post-matra virama.
187  } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||
188  post_matra) {
189  if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {
190  // It is valid to have an unterminated virama at the end of a word, but
191  // for consistency, we will always add ZWNJ if not present.
192  output_.push_back(kZeroWidthNonJoiner);
193  } else {
// NOTE(review): listing numbering skips 194; the statement handling an
// already-present ZWNJ is not present -- confirm.
195  }
196  // Explicit virama [H z]
197  MultiCodePart(2);
198  }
// Branch 2: a joiner preceded the virama.
199  } else {
200  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// UseMultiCode returning true is treated as an error here, unlike in branch 1;
// per the message, presumably it means no further code (2nd consonant) exists.
201  if (UseMultiCode(2)) {
202  if (report_errors_) {
203  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
204  }
205  return false;
206  }
// Reject a joiner on both sides of the virama.
// NOTE(review): listing numbering skips 208; the second operand of the `||`
// and the opening brace are not present -- confirm.
207  if (codes_[codes_used_].second == kZeroWidthJoiner ||
209  if (report_errors_) {
210  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
211  codes_[codes_used_].second);
212  }
213  return false;
214  }
215  }
216  // It is good so far as it goes.
217  return true;
218 }
219 
220 // Helper consumes/copies a series of consonants separated by viramas while
221 // valid, but not any vowel or other modifiers.
// Returns false only when ConsumeViramaIfValid rejects a virama; every other
// exit returns true.
222 bool ValidateIndic::ConsumeConsonantHeadIfValid() {
223  const unsigned num_codes = codes_.size();
224  // Consonant aksara
// Each pass handles one consonant, an optional nukta, an optional joiner and
// an optional virama; the loop repeats while a consonant follows.
225  do {
// NOTE(review): listing numbering skips 226, so a statement is missing here;
// the checks below inspect output_.back(), so the consonant is presumably
// copied to output_ on the missing line -- confirm.
227  // Special Sinhala case of [H Z Yayana/Rayana].
// index may be negative when output_ holds fewer than 3 codes; the size guard
// in the condition short-circuits before output_[index] is read.
228  int index = output_.size() - 3;
229  if (output_used_ + 3 <= output_.size() &&
230  (output_.back() == kYayana || output_.back() == kRayana) && IsVirama(output_[index]) &&
231  output_[index + 1] == kZeroWidthJoiner) {
232  MultiCodePart(3);
233  }
234  bool have_nukta = false;
235  if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {
236  have_nukta = true;
// NOTE(review): listing numbering skips 237; the statement consuming the
// nukta is not present -- confirm.
238  }
239  // Test for subscript conjunct.
// have_nukta converts to 0 or 1, widening the look-back window by one code
// when a nukta was seen.
240  index = output_.size() - 2 - have_nukta;
241  if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
242  IsVirama(output_[index])) {
243  // Output previous virama, consonant + optional nukta.
244  MultiCodePart(2 + have_nukta);
245  }
// (kOther, 0) stands for "no joiner"; it is what ConsumeViramaIfValid receives
// when no pre-virama joiner is kept.
246  IndicPair joiner(CharClass::kOther, 0);
// NOTE(review): listing numbering skips 249; the rest of this condition (a
// further restriction on ZWNJ, plus the closing parentheses and brace) is not
// present -- confirm.
247  if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||
248  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
250  joiner = codes_[codes_used_];
// A joiner that is the last code is dropped (never copied to output_) and the
// head is accepted.
251  if (++codes_used_ == num_codes) {
252  if (report_errors_) {
253  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second);
254  }
255  return true;
256  }
// The joiner is kept only when a virama follows it; otherwise it is dropped
// and joiner is reset to the "no joiner" value.
257  if (codes_[codes_used_].first == CharClass::kVirama) {
258  output_.push_back(joiner.second);
259  } else {
260  if (report_errors_) {
261  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second,
262  codes_[codes_used_].second);
263  }
264  joiner = std::make_pair(CharClass::kOther, 0);
265  }
266  }
// post_matra is false here: this virama sits inside the consonant run.
267  if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {
268  if (!ConsumeViramaIfValid(joiner, false)) {
269  return false;
270  }
271  } else {
272  break; // No virama, so the run of consonants is over.
273  }
274  } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);
// Hand over one code via MultiCodePart(1) if output_ still has codes beyond
// output_used_.
275  if (output_used_ < output_.size()) {
276  MultiCodePart(1);
277  }
278  return true;
279 }
280 
281 // Helper consumes/copies a tail part of a consonant, comprising optional
282 // matra/piece, vowel modifier, vedic mark, terminating virama.
283 bool ValidateIndic::ConsumeConsonantTailIfValid() {
284  if (codes_used_ == codes_.size()) {
285  return true;
286  }
287  // No virama: Finish the grapheme.
288  // Are multiple matras allowed?
289  if (codes_[codes_used_].first == CharClass::kMatra) {
290  if (UseMultiCode(1)) {
291  return true;
292  }
293  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
294  if (UseMultiCode(1)) {
295  return true;
296  }
297  }
298  }
299  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
300  if (UseMultiCode(1)) {
301  return true;
302  }
303  // Only Malayalam allows only repeated 0xd02.
304  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {
305  break;
306  }
307  }
308  while (codes_[codes_used_].first == CharClass::kVedicMark) {
309  if (UseMultiCode(1)) {
310  return true;
311  }
312  }
313  if (codes_[codes_used_].first == CharClass::kVirama) {
314  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
315  return false;
316  }
317  }
318  // What we have consumed so far is a valid consonant cluster.
319  if (output_used_ < output_.size()) {
320  MultiCodePart(1);
321  }
322 
323  return true;
324 }
325 
326 // Helper consumes/copies a vowel and optional modifiers.
// Always returns true: no path in this function rejects the sequence.
327 bool ValidateIndic::ConsumeVowelIfValid() {
// Consume the leading code (the one that selected this path). A true result
// from UseMultiCode ends the grapheme successfully, here and below.
328  if (UseMultiCode(1)) {
329  return true;
330  }
// Optional vowel modifier(s).
331  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
332  if (UseMultiCode(1)) {
333  return true;
334  }
335  // Only Malayalam allows repeated modifiers?
// NOTE(review): listing numbering skips 336; the condition guarding this break
// is not present. Presumably it is a script check matching the comment above;
// confirm it rather than copying the guard from ConsumeConsonantTailIfValid.
337  break;
338  }
339  }
// Any number of vedic marks.
340  while (codes_[codes_used_].first == CharClass::kVedicMark) {
341  if (UseMultiCode(1)) {
342  return true;
343  }
344  }
345  // What we have consumed so far is a valid vowel cluster.
346  return true;
347 }
348 
349 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
std::vector< char32 > output_
Definition: validator.h:229
unsigned output_used_
Definition: validator.h:233
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
void MultiCodePart(unsigned length)
Definition: validator.h:176
static bool IsVirama(char32 unicode)
Definition: validator.cpp:169
static const int kIndicCodePageSize
Definition: validator.h:207
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:135
bool IsSubscriptScript() const
Definition: validator.cpp:184
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98