tesseract  5.0.0
unicharset_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "unicharset.h"
13 #include <string>
14 #include "gmock/gmock.h" // for testing::ElementsAreArray
15 #include "include_gunit.h"
16 #include "log.h" // for LOG
17 
18 using testing::ElementsAreArray;
19 
20 namespace tesseract {
21 
22 class UnicharsetTest : public ::testing::Test {
23 protected:
24  void SetUp() override {
25  std::locale::global(std::locale(""));
26  }
27 };
28 
29 TEST(UnicharsetTest, Basics) {
30  // This test verifies basic insertion, unichar_to_id, and encode.
31  UNICHARSET u;
32  u.unichar_insert("a");
33  EXPECT_EQ(u.size(), 4);
34  u.unichar_insert("f");
35  EXPECT_EQ(u.size(), 5);
36  u.unichar_insert("i");
37  EXPECT_EQ(u.size(), 6);
38  // The fi ligature is NOT added because it can be encoded with a cleanup as f
39  // then i.
40  u.unichar_insert("\ufb01");
41  EXPECT_EQ(u.size(), 6);
42  u.unichar_insert("e");
43  EXPECT_EQ(u.size(), 7);
44  u.unichar_insert("n");
45  EXPECT_EQ(u.size(), 8);
46  EXPECT_EQ(u.unichar_to_id("f"), 4);
47  EXPECT_EQ(u.unichar_to_id("i"), 5);
48  // The fi ligature has no valid id.
49  EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID);
50  // The fi pair has no valid id.
51  EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID);
52  std::vector<int> labels;
53  EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr));
54  std::vector<int> v(&labels[0], &labels[0] + labels.size());
55  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
56  // With the fi ligature encoding fails without a pre-cleanup.
57  std::string lig_str = "af\ufb01ne";
58  EXPECT_FALSE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
59  lig_str = u.CleanupString(lig_str.c_str());
60  EXPECT_TRUE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
61  v = std::vector<int>(&labels[0], &labels[0] + labels.size());
62  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
63 }
64 
65 TEST(UnicharsetTest, Multibyte) {
66  // This test verifies basic insertion, unichar_to_id, and encode.
67  // The difference from Basic above is that now we are testing multi-byte
68  // unicodes instead of single byte.
69  UNICHARSET u;
70  // Insert some Arabic letters.
71  u.unichar_insert("\u0627");
72  EXPECT_EQ(u.size(), 4);
73  u.unichar_insert("\u062c");
74  EXPECT_EQ(u.size(), 5);
75  u.unichar_insert("\u062f");
76  EXPECT_EQ(u.size(), 6);
77  u.unichar_insert("\ufb01"); // fi ligature is added as fi pair.
78  EXPECT_EQ(u.size(), 7);
79  u.unichar_insert("\u062b");
80  EXPECT_EQ(u.size(), 8);
81  u.unichar_insert("\u0635");
82  EXPECT_EQ(u.size(), 9);
83  EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
84  EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
85  // The first two bytes of this string is \u0627, which matches id 3;
86  EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
87  EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
88  // Individual f and i are not present, but they are there as a pair.
89  EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID);
90  EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID);
91  EXPECT_EQ(u.unichar_to_id("fi"), 6);
92  // The fi ligature is findable.
93  EXPECT_EQ(u.unichar_to_id("\ufb01"), 6);
94  std::vector<int> labels;
95  EXPECT_TRUE(
96  u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, &labels, nullptr, nullptr));
97  std::vector<int> v(&labels[0], &labels[0] + labels.size());
98  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7}));
99  // With the fi ligature the fi is picked out.
100  std::vector<char> lengths;
101  unsigned encoded_length;
102  std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
103  // src_str has to be pre-cleaned for lengths to be correct.
104  std::string cleaned = u.CleanupString(src_str.c_str());
105  EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, &encoded_length));
106  EXPECT_EQ(encoded_length, cleaned.size());
107  std::string len_str(&lengths[0], lengths.size());
108  EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
109  v = std::vector<int>(&labels[0], &labels[0] + labels.size());
110  EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));
111 }
112 
113 TEST(UnicharsetTest, MultibyteBigrams) {
114  // This test verifies basic insertion, unichar_to_id, and encode.
115  // The difference from Basic above is that now we are testing multi-byte
116  // unicodes instead of single byte.
117  UNICHARSET u;
118  // Insert some Arabic letters.
119  u.unichar_insert("\u0c9c");
120  EXPECT_EQ(u.size(), 4);
121  u.unichar_insert("\u0cad");
122  EXPECT_EQ(u.size(), 5);
123  u.unichar_insert("\u0ccd\u0c9c");
124  EXPECT_EQ(u.size(), 6);
125  u.unichar_insert("\u0ccd");
126  EXPECT_EQ(u.size(), 7);
127  // By default the encodable bigram is NOT added.
128  u.unichar_insert("\u0ccd\u0cad");
129  EXPECT_EQ(u.size(), 7);
130  // It is added if we force it to be.
131  u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue);
132  EXPECT_EQ(u.size(), 8);
133  std::vector<char> data;
134  tesseract::TFile fp;
135  fp.OpenWrite(&data);
136  u.save_to_file(&fp);
137  fp.Open(&data[0], data.size());
138  UNICHARSET v;
139  v.load_from_file(&fp, false);
140  EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3);
141  EXPECT_EQ(v.unichar_to_id("\u0cad"), 4);
142  EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5);
143  EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6);
144  EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7);
145 }
146 
147 TEST(UnicharsetTest, OldStyle) {
148  // This test verifies an old unicharset that contains fi/fl ligatures loads
149  // and keeps all the entries.
150  std::string filename = file::JoinPath(TESTDATA_DIR, "eng.unicharset");
151  UNICHARSET u;
152  LOG(INFO) << "Filename=" << filename;
153  EXPECT_TRUE(u.load_from_file(filename.c_str()));
154  EXPECT_EQ(u.size(), 111);
155 }
156 
157 } // namespace tesseract
@ LOG
@ INFO
Definition: log.h:28
TEST(TesseractInstanceTest, TestMultipleTessInstances)
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65