tesseract  5.0.0
tesseract::LigatureTable Class Reference

#include <ligature_table.h>

Public Member Functions

std::string AddLigatures (const std::string &str, const PangoFontInfo *font) const
 
std::string RemoveLigatures (const std::string &str) const
 
std::string RemoveCustomLigatures (const std::string &str) const
 
const LigHash & norm_to_lig_table () const
 
const LigHash & lig_to_norm_table () const
 

Static Public Member Functions

static LigatureTable * Get ()
 

Protected Member Functions

 LigatureTable ()
 
void Init ()
 

Protected Attributes

LigHash norm_to_lig_table_
 
LigHash lig_to_norm_table_
 
int min_lig_length_
 
int max_lig_length_
 
int min_norm_length_
 
int max_norm_length_
 

Static Protected Attributes

static std::unique_ptr< LigatureTable > instance_
 

Detailed Description

Definition at line 38 of file ligature_table.h.

Constructor & Destructor Documentation

◆ LigatureTable()

tesseract::LigatureTable::LigatureTable ( )
protected

Member Function Documentation

◆ AddLigatures()

std::string tesseract::LigatureTable::AddLigatures ( const std::string &  str,
const PangoFontInfo *  font 
) const

Definition at line 153 of file ligature_table.cpp.

153  {
154  std::string result;
155  int len = str.size();
156  int step = 0;
157  int i = 0;
158  for (i = 0; i < len - min_norm_length_ + 1; i += step) {
159  step = 0;
160  for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
161  if (i + liglen <= len) {
162  std::string lig_cand = str.substr(i, liglen);
163  auto it = norm_to_lig_table_.find(lig_cand);
164  if (it != norm_to_lig_table_.end()) {
165  tlog(3, "Considering %s -> %s\n", lig_cand.c_str(), it->second.c_str());
166  if (font) {
167  // Test for renderability.
168  if (!font->CanRenderString(it->second.data(), it->second.length())) {
169  continue; // Not renderable
170  }
171  }
172  // Found a match so convert it.
173  step = liglen;
174  result += it->second;
175  tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(), it->second.c_str());
176  break;
177  }
178  }
179  }
180  if (step == 0) {
181  result += str[i];
182  step = 1;
183  }
184  }
185  result += str.substr(i, len - i);
186  return result;
187 }
#define tlog(level,...)
Definition: tlog.h:36

◆ Get()

LigatureTable * tesseract::LigatureTable::Get ( )
static

Definition at line 51 of file ligature_table.cpp.

51  {
52  if (instance_ == nullptr) {
53  instance_.reset(new LigatureTable());
54  instance_->Init();
55  }
56  return instance_.get();
57 }
static std::unique_ptr< LigatureTable > instance_

◆ Init()

void tesseract::LigatureTable::Init ( )
protected

Definition at line 62 of file ligature_table.cpp.

62  {
63  if (norm_to_lig_table_.empty()) {
64  for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
65  // For each char in the range, convert to utf8, nfc normalize, and if
66  // the strings are different put the both mappings in the hash_maps.
67  std::string lig8 = EncodeAsUTF8(lig);
68  icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
69  icu::UnicodeString normed8_result;
70  icu::ErrorCode status;
71  icu::Normalizer::normalize(unicode_lig8, UNORM_NFC, 0, normed8_result, status);
72  std::string normed8;
73  normed8_result.toUTF8String(normed8);
74  int lig_length = lig8.length();
75  int norm_length = normed8.size();
76  if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
77  norm_to_lig_table_[normed8] = lig8;
78  lig_to_norm_table_[lig8] = normed8;
79  if (min_lig_length_ == 0 || lig_length < min_lig_length_) {
80  min_lig_length_ = lig_length;
81  }
82  if (lig_length > max_lig_length_) {
83  max_lig_length_ = lig_length;
84  }
85  if (min_norm_length_ == 0 || norm_length < min_norm_length_) {
86  min_norm_length_ = norm_length;
87  }
88  if (norm_length > max_norm_length_) {
89  max_norm_length_ = norm_length;
90  }
91  }
92  }
93  // Add custom extra ligatures.
94  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
95  norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] = UNICHARSET::kCustomLigatures[i][1];
96  int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
97  if (min_norm_length_ == 0 || norm_length < min_norm_length_) {
98  min_norm_length_ = norm_length;
99  }
100  if (norm_length > max_norm_length_) {
101  max_norm_length_ = norm_length;
102  }
103 
104  lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] = UNICHARSET::kCustomLigatures[i][0];
105  }
106  }
107 }
signed int char32
const int kMinLigature
const int kMaxLigature
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169

◆ lig_to_norm_table()

const LigHash& tesseract::LigatureTable::lig_to_norm_table ( ) const
inline

Definition at line 55 of file ligature_table.h.

55  {
56  return lig_to_norm_table_;
57  }

◆ norm_to_lig_table()

const LigHash& tesseract::LigatureTable::norm_to_lig_table ( ) const
inline

Definition at line 52 of file ligature_table.h.

52  {
53  return norm_to_lig_table_;
54  }

◆ RemoveCustomLigatures()

std::string tesseract::LigatureTable::RemoveCustomLigatures ( const std::string &  str) const

Definition at line 128 of file ligature_table.cpp.

128  {
129  std::string result;
130  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
131  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
132  char tmp[5];
133  int len;
134  int norm_ind;
135  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
136  len = it.get_utf8(tmp);
137  tmp[len] = '\0';
138  norm_ind = -1;
139  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {
140  if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
141  norm_ind = i;
142  }
143  }
144  if (norm_ind >= 0) {
145  result += UNICHARSET::kCustomLigatures[norm_ind][0];
146  } else {
147  result += tmp;
148  }
149  }
150  return result;
151 }
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213

◆ RemoveLigatures()

std::string tesseract::LigatureTable::RemoveLigatures ( const std::string &  str) const

Definition at line 109 of file ligature_table.cpp.

109  {
110  std::string result;
111  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
112  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
113  char tmp[5];
114  int len;
115  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
116  len = it.get_utf8(tmp);
117  tmp[len] = '\0';
118  auto lig_it = lig_to_norm_table_.find(tmp);
119  if (lig_it != lig_to_norm_table_.end()) {
120  result += lig_it->second;
121  } else {
122  result += tmp;
123  }
124  }
125  return result;
126 }

Member Data Documentation

◆ instance_

std::unique_ptr< LigatureTable > tesseract::LigatureTable::instance_
static protected

Definition at line 65 of file ligature_table.h.

◆ lig_to_norm_table_

LigHash tesseract::LigatureTable::lig_to_norm_table_
protected

Definition at line 67 of file ligature_table.h.

◆ max_lig_length_

int tesseract::LigatureTable::max_lig_length_
protected

Definition at line 69 of file ligature_table.h.

◆ max_norm_length_

int tesseract::LigatureTable::max_norm_length_
protected

Definition at line 71 of file ligature_table.h.

◆ min_lig_length_

int tesseract::LigatureTable::min_lig_length_
protected

Definition at line 68 of file ligature_table.h.

◆ min_norm_length_

int tesseract::LigatureTable::min_norm_length_
protected

Definition at line 70 of file ligature_table.h.

◆ norm_to_lig_table_

LigHash tesseract::LigatureTable::norm_to_lig_table_
protected

Definition at line 66 of file ligature_table.h.


The documentation for this class was generated from the following files: