tesseract  5.0.0
pango_font_info_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "pango_font_info.h"
13 #include <pango/pango.h>
14 #include <cstdio>
15 #include <string>
16 #include "commandlineflags.h"
17 #include "fileio.h"
18 #include "gmock/gmock-matchers.h" // for EXPECT_THAT
19 #include "include_gunit.h"
20 #ifdef INCLUDE_TENSORFLOW
21 # include "util/utf8/unicodetext.h" // for UnicodeText
22 #endif
23 
24 namespace tesseract {
25 
// Fonts in testdata directory, in the order the font-listing test compares
// them against FontUtils::ListAvailableFonts().
// The pointers themselves are const: the table is read-only reference data and
// a const namespace-scope array also gets internal linkage, so it cannot clash
// with a same-named symbol in another test translation unit.
const char *const kExpectedFontNames[] = {"Arab",
                                          "Arial Bold Italic",
                                          "DejaVu Sans Ultra-Light",
                                          "Lohit Hindi",
#if PANGO_VERSION <= 12005
                                          "Times New Roman",
#else
                                          "Times New Roman,", // Pango v1.36.2 requires a trailing ','
#endif
                                          "UnBatang",
                                          "Verdana"};
38 
// Sample text used in tests, one string per script.
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
const char kEngText[] = "the quick brown fox jumps over the lazy dog";
const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
const char kKorText[] = "이는 것으로";
// Hindi words containing illegal vowel sequences.
// The list is terminated by a nullptr sentinel so it can be walked without
// knowing its length (which depends on the Pango version). The pointers are
// const because the table is read-only test data.
const char *const kBadlyFormedHinWords[] = {
#if PANGO_VERSION <= 12005
    "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
#endif
    // Pango v1.36.2 will render the above words even though they are invalid.
    "प्रंात", nullptr};
51 
52 static PangoFontMap *font_map;
53 
54 class PangoFontInfoTest : public ::testing::Test {
55 protected:
56  void SetUp() override {
57  if (!font_map) {
58  font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
59  }
60  pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
61  }
62 
63  // Creates a fake fonts.conf file that points to the testdata fonts for
64  // fontconfig to initialize with.
65  static void SetUpTestCase() {
66  static std::locale system_locale("");
67  std::locale::global(system_locale);
68 
69  FLAGS_fonts_dir = TESTING_DIR;
70  FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
72  PangoFontInfo::SoftInitFontConfig(); // init early
73  }
74 
76 };
77 
78 TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) {
79  PangoFontInfo font("Arial Bold Italic 12");
80  EXPECT_EQ(12, font.font_size());
81  EXPECT_EQ("Arial", font.family_name());
82 }
83 
84 TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) {
85  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12"));
86  EXPECT_EQ(12, font_info_.font_size());
87  EXPECT_EQ("Arial", font_info_.family_name());
88 
89  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10"));
90  EXPECT_EQ(10, font_info_.font_size());
91  EXPECT_EQ("Verdana", font_info_.family_name());
92 
93  EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light"));
94  EXPECT_EQ("DejaVu Sans", font_info_.family_name());
95 }
96 
97 TEST_F(PangoFontInfoTest, DoesParseMissingFonts) {
98  // Font family one of whose faces exists but this one doesn't.
99  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
100  EXPECT_EQ(12, font_info_.font_size());
101  EXPECT_EQ("Arial", font_info_.family_name());
102 
103  // Font family that doesn't exist in testdata. It will still parse the
104  // description name. But without the file, it will not be able to populate
105  // some font family details, like is_monospace().
106  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10"));
107  EXPECT_EQ(10, font_info_.font_size());
108  EXPECT_EQ("Georgia", font_info_.family_name());
109 }
110 
111 TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) {
112  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
113  int x_bearing, x_advance;
114  EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance));
115  EXPECT_GT(x_advance, 0);
116  EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance));
117  EXPECT_GT(x_advance, 0);
118 }
119 
120 TEST_F(PangoFontInfoTest, CanRenderString) {
121  font_info_.ParseFontDescriptionName("Verdana 12");
122  EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText)));
123 
124  font_info_.ParseFontDescriptionName("UnBatang 12");
125  EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
126 
127  font_info_.ParseFontDescriptionName("Lohit Hindi 12");
128  EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText)));
129 }
130 
131 TEST_F(PangoFontInfoTest, CanRenderLigature) {
132  font_info_.ParseFontDescriptionName("Arab 12");
133  const char kArabicLigature[] = "لا";
134  EXPECT_TRUE(font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
135 
136  printf("Next word\n");
137  EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
138 }
139 
140 TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {
141  font_info_.ParseFontDescriptionName("Verdana 12");
142  EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
143 }
144 
145 TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
146  font_info_.ParseFontDescriptionName("Lohit Hindi 12");
147  for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {
148  EXPECT_FALSE(
149  font_info_.CanRenderString(kBadlyFormedHinWords[i], strlen(kBadlyFormedHinWords[i])))
150  << "Can render " << kBadlyFormedHinWords[i];
151  }
152 }
153 
154 TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
155  font_info_.ParseFontDescriptionName("Verdana 12");
156  // Verdana cannot render the "ff" ligature
157  std::string word = "office";
158  EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
159  EXPECT_EQ("oice", word);
160 
161  // Don't drop non-letter characters like word joiners.
162  const char *kJoiners[] = {
163  "\u2060", // U+2060 (WJ)
164  "\u200C", // U+200C (ZWJ)
165  "\u200D" // U+200D (ZWNJ)
166  };
167  for (auto &kJoiner : kJoiners) {
168  word = kJoiner;
169  EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
170  EXPECT_STREQ(kJoiner, word.c_str());
171  }
172 }
173 
174 // ------------------------ FontUtils ------------------------------------
175 
176 class FontUtilsTest : public ::testing::Test {
177 protected:
178  void SetUp() override {
180  }
181  // Creates a fake fonts.conf file that points to the testdata fonts for
182  // fontconfig to initialize with.
183  static void SetUpTestCase() {
184  FLAGS_fonts_dir = TESTING_DIR;
185  FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
186  if (!font_map) {
187  font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
188  }
189  pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
190  }
191 
192 #ifdef INCLUDE_TENSORFLOW
193  void CountUnicodeChars(const char *utf8_text, std::unordered_map<char32, int64_t> *ch_map) {
194  ch_map->clear();
195  UnicodeText ut;
196  ut.PointToUTF8(utf8_text, strlen(utf8_text));
197  for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
198 # if 0
199  if (UnicodeProps::IsWhitespace(*it)) continue;
200 # else
201  if (std::isspace(*it))
202  continue;
203 # endif
204  ++(*ch_map)[*it];
205  }
206  }
207 #endif
208 };
209 
210 TEST_F(FontUtilsTest, DoesFindAvailableFonts) {
211  EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic"));
212  EXPECT_TRUE(FontUtils::IsAvailableFont("Verdana"));
213  EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light"));
214 
215  // Test that we can support font name convention for Pango v1.30.2 even when
216  // we are running an older version.
217  EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,"));
218 }
219 
220 TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
221  // Only bold italic face is available.
222  EXPECT_FALSE(FontUtils::IsAvailableFont("Arial"));
223  // Don't have a ttf for the Courier family.
224  EXPECT_FALSE(FontUtils::IsAvailableFont("Courier"));
225  // Pango "synthesizes" the italic font from the available Verdana Regular and
226  // includes it in its list, but it is not really loadable.
227  EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic"));
228  // We have "Dejavu Sans Ultra-Light" but not its medium weight counterpart.
229  EXPECT_FALSE(FontUtils::IsAvailableFont("DejaVu Sans"));
230 }
231 
232 TEST_F(FontUtilsTest, DoesListAvailableFonts) {
233  const std::vector<std::string> &fonts = FontUtils::ListAvailableFonts();
234  EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
235  for (auto &font : fonts) {
236  PangoFontInfo font_info;
237  EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
238  }
239 }
240 
#ifdef INCLUDE_TENSORFLOW
// BestFonts must return a non-empty list and fill font_flags with one entry
// per font that can render the counted characters.
TEST_F(FontUtilsTest, DoesFindBestFonts) {
  std::unordered_map<char32, int64_t> ch_map;
  CountUnicodeChars(kEngText, &ch_map);
  EXPECT_EQ(26u, ch_map.size()); // 26 letters
  std::vector<std::pair<const char *, std::vector<bool>>> font_flags;
  std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
  EXPECT_FALSE(best_list.empty());
  // All fonts except Lohit Hindi should render English text.
  EXPECT_EQ(countof(kExpectedFontNames) - 1, font_flags.size());

  CountUnicodeChars(kKorText, &ch_map);
  best_list = FontUtils::BestFonts(ch_map, &font_flags);
  EXPECT_FALSE(best_list.empty());
  // Only UnBatang font family is able to render korean.
  // ASSERT (not EXPECT) so font_flags[0] is never read from an empty vector.
  ASSERT_EQ(1u, font_flags.size());
  EXPECT_STREQ("UnBatang", font_flags[0].first);
}
#endif
261 
262 TEST_F(FontUtilsTest, DoesSelectFont) {
263  const char *kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr};
264  const char *kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
265  for (int i = 0; kLangText[i] != nullptr; ++i) {
266  SCOPED_TRACE(kLangNames[i]);
267  std::vector<std::string> graphemes;
268  std::string selected_font;
269  EXPECT_TRUE(
270  FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes));
271  EXPECT_TRUE(selected_font.size());
272  EXPECT_TRUE(graphemes.size());
273  }
274 }
275 
276 TEST_F(FontUtilsTest, DoesFailToSelectFont) {
277  const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
278  std::vector<std::string> graphemes;
279  std::string selected_font;
280  EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font,
281  &graphemes));
282 }
283 
#if 0
// Needs fix. FontUtils::GetAllRenderableCharacters was removed
// because of deprecated pango_coverage_max.
// The test is kept (disabled) as a record of the expected coverage of the
// testdata fonts.
TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
  const int32_t kHindiChar = 0x0905;
  const int32_t kArabicChar = 0x0623;
  const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
  const int32_t kOghamChar = 0x1680;     // Ogham space mark
  std::vector<bool> unicode_mask;
  FontUtils::GetAllRenderableCharacters(&unicode_mask);
  EXPECT_TRUE(unicode_mask['A']);
  EXPECT_TRUE(unicode_mask['1']);
  EXPECT_TRUE(unicode_mask[kHindiChar]);
  EXPECT_TRUE(unicode_mask[kArabicChar]);
  EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
  EXPECT_FALSE(unicode_mask[kOghamChar]);  // no font for ogham.
#  endif
  unicode_mask.clear();

  std::vector<std::string> selected_fonts;
  selected_fonts.push_back("Lohit Hindi");
  FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
  EXPECT_TRUE(unicode_mask['1']);
  EXPECT_TRUE(unicode_mask[kHindiChar]);
  EXPECT_FALSE(unicode_mask['A']);            // Lohit doesn't render English,
  EXPECT_FALSE(unicode_mask[kArabicChar]);    // or Arabic,
  EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
  EXPECT_FALSE(unicode_mask[kOghamChar]);     // or Ogham.
  unicode_mask.clear();

  // Check that none of the included fonts cover the Mongolian or Ogham space
  // characters.
  for (size_t f = 0; f < countof(kExpectedFontNames); ++f) {
    // Build the trace label through std::string: adding a string literal to a
    // const char * directly would be pointer + pointer, which does not compile.
    std::string tracestring = std::string("Testing ") + kExpectedFontNames[f];
    SCOPED_TRACE(tracestring);
    FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
    EXPECT_FALSE(unicode_mask[kOghamChar]);
#  endif
    EXPECT_FALSE(unicode_mask[kMongolianChar]);
    unicode_mask.clear();
  }
}
#endif
329 
330 } // namespace tesseract
const char kArabicText[]
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:228
const char kHinText[]
const char kEngText[]
const char kKorText[]
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:42
const char * kBadlyFormedHinWords[]
const char * kExpectedFontNames[]
TEST_F(EuroText, FastLatinOCR)
bool ParseFontDescriptionName(const std::string &name)
const std::string & family_name() const
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
static bool IsAvailableFont(const char *font_desc)
static std::string BestFonts(const std::unordered_map< char32, int64_t > &ch_map, std::vector< std::pair< const char *, std::vector< bool >>> *font_flag)
static const std::vector< std::string > & ListAvailableFonts()
static void MakeTmpdir()
Definition: include_gunit.h:38
const_iterator end() const
Definition: unicodetext.cc:412
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
const_iterator begin() const
Definition: unicodetext.cc:408