tesseract  5.0.0
validate_grapheme_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "normstrngs.h"
14 #include "normstrngs_test.h"
15 
16 namespace tesseract {
17 
18 TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
19  std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
20  std::vector<std::string> glyphs;
22  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
24  // It made 3 graphemes.
25  EXPECT_EQ(glyphs.size(), 3);
26  EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
27  EXPECT_EQ(glyphs[1], std::string("\u0c15"));
28  EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
29 }
30 
31 TEST(ValidateGraphemeTest, SingleConsonantOK) {
32  std::string str = "\u0cb9"; // HA
33  std::vector<std::string> glyphs;
35  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
37  EXPECT_EQ(glyphs.size(), 1);
38  EXPECT_EQ(glyphs[0], str);
39 }
40 
41 TEST(ValidateGraphemeTest, SimpleCV) {
42  std::string str = "\u0cb9\u0cbf"; // HA I
43  std::vector<std::string> glyphs;
45  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
47  EXPECT_EQ(glyphs.size(), 1);
48  EXPECT_EQ(glyphs[0], str);
49 }
50 
51 TEST(ValidateGraphemeTest, SubscriptConjunct) {
52  std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
53  std::vector<std::string> glyphs;
55  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
57  EXPECT_EQ(glyphs.size(), 1);
58  EXPECT_EQ(glyphs[0], str);
60  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
61  &glyphs))
63  EXPECT_EQ(glyphs.size(), 3);
64  EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
65 }
66 
67 TEST(ValidateGraphemeTest, HalfFormJoiner) {
68  std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
69  std::vector<std::string> glyphs;
71  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
73  EXPECT_EQ(glyphs.size(), 1);
74  EXPECT_EQ(glyphs[0], str);
76  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
77  &glyphs))
79  EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
80  EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
81 }
82 
83 TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
84  std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
85  std::vector<std::string> glyphs;
87  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
89  EXPECT_EQ(glyphs.size(), 1);
90  EXPECT_EQ(glyphs[0], str);
92  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
93  &glyphs))
95  EXPECT_EQ(glyphs.size(), 3);
96  EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
97 }
98 
99 TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
100  std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
101  std::vector<std::string> glyphs;
103  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
105  EXPECT_EQ(glyphs.size(), 1);
106  EXPECT_EQ(glyphs[0], str);
108  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
109  &glyphs))
111  EXPECT_EQ(glyphs.size(), 3);
112  EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
113  // Malaylam only, so not allowed in Telugu.
114  str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
116  GraphemeNormMode::kCombined, true, str.c_str(),
117  &glyphs))
119 }
120 
121 TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
122  std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
123  std::vector<std::string> glyphs;
125  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
127  EXPECT_EQ(glyphs.size(), 2);
128  EXPECT_EQ(glyphs[1], std::string("\u0d24"));
130  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
131  &glyphs))
133  EXPECT_EQ(glyphs.size(), 3);
134  EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
135 }
136 
137 TEST(ValidateGraphemeTest, ThaiGraphemes) {
138  // This is a single grapheme unless in glyph split mode
139  std::string str = "\u0e14\u0e38\u0e4a";
140  std::vector<std::string> glyphs;
142  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
144  EXPECT_EQ(glyphs.size(), 1);
145  EXPECT_EQ(glyphs[0], str);
147  GraphemeNormMode::kGlyphSplit, true, str.c_str(),
148  &glyphs))
150  EXPECT_EQ(glyphs.size(), 3);
151  EXPECT_EQ(glyphs[0], std::string("\u0e14"));
152 }
153 
154 TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
155  std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
156  std::vector<std::string> glyphs;
157  // Returns true, but the joiner is gone.
159  GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
161  EXPECT_EQ(glyphs.size(), 5);
162  EXPECT_EQ(glyphs[0], std::string("'"));
163  EXPECT_EQ(glyphs[1], std::string("\u0d24"));
164  EXPECT_EQ(glyphs[2], std::string("\u0d23"));
165  EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
166  EXPECT_EQ(glyphs[4], std::string("'"));
167 }
168 
169 } // namespace tesseract
std::string PrintString32WithUnicodes(const std::string &str)
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179
TEST(TesseractInstanceTest, TestMultipleTessInstances)