tesseract  5.0.0
tesseract::UNICHAR Class Reference

#include <unichar.h>

Classes

class  const_iterator
 

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 
static const_iterator begin (const char *utf8_str, int byte_length)
 
static const_iterator end (const char *utf8_str, int byte_length)
 
static std::vector< char32 > UTF8ToUTF32 (const char *utf8_str)
 
static std::string UTF32ToUTF8 (const std::vector< char32 > &str32)
 

Detailed Description

Definition at line 57 of file unichar.h.

Constructor & Destructor Documentation

◆ UNICHAR() [1/3]

tesseract::UNICHAR::UNICHAR ( )
inline

Definition at line 59 of file unichar.h.

59  {
60  memset(chars, 0, UNICHAR_LEN);
61  }
#define UNICHAR_LEN
Definition: unichar.h:33

◆ UNICHAR() [2/3]

tesseract::UNICHAR::UNICHAR ( const char *  utf8_str,
int  len 
)

Definition at line 31 of file unichar.cpp.

31  {
32  int total_len = 0;
33  int step = 0;
34  if (len < 0) {
35  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36  ;
37  }
38  }
39  for (total_len = 0; total_len < len; total_len += step) {
40  step = utf8_step(utf8_str + total_len);
41  if (total_len + step > UNICHAR_LEN) {
42  break; // Too long.
43  }
44  if (step == 0) {
45  break; // Illegal first byte.
46  }
47  int i;
48  for (i = 1; i < step; ++i) {
49  if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50  break;
51  }
52  }
53  if (i < step) {
54  break; // Illegal surrogate
55  }
56  }
57  memcpy(chars, utf8_str, total_len);
58  if (total_len < UNICHAR_LEN) {
59  chars[UNICHAR_LEN - 1] = total_len;
60  while (total_len < UNICHAR_LEN - 1) {
61  chars[total_len++] = 0;
62  }
63  }
64 }
char * utf8_str() const
Definition: unichar.cpp:134
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143

◆ UNICHAR() [3/3]

tesseract::UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 68 of file unichar.cpp.

68  {
69  const int bytemask = 0xBF;
70  const int bytemark = 0x80;
71 
72  if (unicode < 0x80) {
73  chars[UNICHAR_LEN - 1] = 1;
74  chars[2] = 0;
75  chars[1] = 0;
76  chars[0] = static_cast<char>(unicode);
77  } else if (unicode < 0x800) {
78  chars[UNICHAR_LEN - 1] = 2;
79  chars[2] = 0;
80  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81  unicode >>= 6;
82  chars[0] = static_cast<char>(unicode | 0xc0);
83  } else if (unicode < 0x10000) {
84  chars[UNICHAR_LEN - 1] = 3;
85  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
86  unicode >>= 6;
87  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
88  unicode >>= 6;
89  chars[0] = static_cast<char>(unicode | 0xe0);
90  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91  chars[UNICHAR_LEN - 1] = 4;
92  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
93  unicode >>= 6;
94  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
95  unicode >>= 6;
96  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
97  unicode >>= 6;
98  chars[0] = static_cast<char>(unicode | 0xf0);
99  } else {
100  memset(chars, 0, UNICHAR_LEN);
101  }
102 }
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:23

Member Function Documentation

◆ begin()

UNICHAR::const_iterator tesseract::UNICHAR::begin ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 209 of file unichar.cpp.

209  {
210  return UNICHAR::const_iterator(utf8_str);
211 }

◆ end()

UNICHAR::const_iterator tesseract::UNICHAR::end ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 213 of file unichar.cpp.

213  {
214  return UNICHAR::const_iterator(utf8_str + len);
215 }

◆ first_uni()

int tesseract::UNICHAR::first_uni ( ) const

Definition at line 105 of file unichar.cpp.

105  {
106  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107  int uni = 0;
108  int len = utf8_step(chars);
109  const char *src = chars;
110 
111  switch (len) {
112  default:
113  break;
114  case 4:
115  uni += static_cast<unsigned char>(*src++);
116  uni <<= 6;
117  // Fall through.
118  case 3:
119  uni += static_cast<unsigned char>(*src++);
120  uni <<= 6;
121  // Fall through.
122  case 2:
123  uni += static_cast<unsigned char>(*src++);
124  uni <<= 6;
125  // Fall through.
126  case 1:
127  uni += static_cast<unsigned char>(*src++);
128  }
129  uni -= utf8_offsets[len];
130  return uni;
131 }

◆ UTF32ToUTF8()

std::string tesseract::UNICHAR::UTF32ToUTF8 ( const std::vector< char32 > &  str32)
static

Definition at line 237 of file unichar.cpp.

237  {
238  std::string utf8_str;
239  for (char32 ch : str32) {
240  UNICHAR uni_ch(ch);
241  int step;
242  if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243  utf8_str.append(uni_ch.utf8(), step);
244  } else {
245  return "";
246  }
247  }
248  return utf8_str;
249 }
signed int char32

◆ utf8()

const char* tesseract::UNICHAR::utf8 ( ) const
inline

Definition at line 83 of file unichar.h.

83  {
84  return chars;
85  }

◆ utf8_len()

int tesseract::UNICHAR::utf8_len ( ) const
inline

Definition at line 77 of file unichar.h.

77  {
78  int len = chars[UNICHAR_LEN - 1];
79  return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
80  }

◆ utf8_step()

int tesseract::UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 143 of file unichar.cpp.

143  {
144  static const char utf8_bytes[256] = {
145  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154 
155  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156 }

◆ utf8_str()

char * tesseract::UNICHAR::utf8_str ( ) const

Definition at line 134 of file unichar.cpp.

134  {
135  int len = utf8_len();
136  char *str = new char[len + 1];
137  memcpy(str, chars, len);
138  str[len] = 0;
139  return str;
140 }
int utf8_len() const
Definition: unichar.h:77

◆ UTF8ToUTF32()

std::vector< char32 > tesseract::UNICHAR::UTF8ToUTF32 ( const char *  utf8_str)
static

Definition at line 220 of file unichar.cpp.

220  {
221  const int utf8_length = strlen(utf8_str);
222  std::vector<char32> unicodes;
223  unicodes.reserve(utf8_length);
224  const_iterator end_it(end(utf8_str, utf8_length));
225  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226  if (it.is_legal()) {
227  unicodes.push_back(*it);
228  } else {
229  unicodes.clear();
230  return unicodes;
231  }
232  }
233  return unicodes;
234 }
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213

The documentation for this class was generated from the following files: