tesseract  5.0.0
rune.c File Reference
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"

Go to the source code of this file.

Enumerations

enum  {
  Bit1 = 7 , Bitx = 6 , Bit2 = 5 , Bit3 = 4 ,
  Bit4 = 3 , Bit5 = 2 , T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF , Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF ,
  T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF , T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF , T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF , T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF ,
  Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1 , Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1 , Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1 , Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1 ,
  Maskx = (1 << Bitx) - 1 , Testx = Maskx ^ 0xFF , Bad = Runeerror
}
 

Functions

int charntorune (Rune *rune, const char *str, int length)
 
int chartorune (Rune *rune, const char *str)
 
int isvalidcharntorune (const char *str, int length, Rune *rune, int *consumed)
 
int runetochar (char *str, const Rune *rune)
 
int runelen (Rune rune)
 
int runenlen (const Rune *r, int nrune)
 
int fullrune (const char *str, int n)
 

Enumeration Type Documentation

◆ anonymous enum

anonymous enum
Enumerator
Bit1 
Bitx 
Bit2 
Bit3 
Bit4 
Bit5 
T1 
Tx 
T2 
T3 
T4 
T5 
Rune1 
Rune2 
Rune3 
Rune4 
Maskx 
Testx 
Bad 

Definition at line 19 of file rune.c.

19  {
20  Bit1 = 7,
21  Bitx = 6,
22  Bit2 = 5,
23  Bit3 = 4,
24  Bit4 = 3,
25  Bit5 = 2,
26 
27  T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */
28  Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */
29  T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */
30  T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */
31  T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */
32  T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */
33 
34  Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */
35  Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */
36  Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */
37  Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1,
38  /* 0001 1111 1111 1111 1111 1111 */
39 
40  Maskx = (1 << Bitx) - 1, /* 0011 1111 */
41  Testx = Maskx ^ 0xFF, /* 1100 0000 */
42 
43  Bad = Runeerror,
44 };
@ T4
Definition: rune.c:31
@ Testx
Definition: rune.c:41
@ T3
Definition: rune.c:30
@ T1
Definition: rune.c:27
@ Rune4
Definition: rune.c:37
@ Maskx
Definition: rune.c:40
@ Rune3
Definition: rune.c:36
@ Bit4
Definition: rune.c:24
@ T5
Definition: rune.c:32
@ Bit5
Definition: rune.c:25
@ T2
Definition: rune.c:29
@ Bit2
Definition: rune.c:22
@ Rune2
Definition: rune.c:35
@ Rune1
Definition: rune.c:34
@ Tx
Definition: rune.c:28
@ Bit3
Definition: rune.c:23
@ Bad
Definition: rune.c:43
@ Bitx
Definition: rune.c:21
@ Bit1
Definition: rune.c:20
@ Runeerror
Definition: utf.h:25

Function Documentation

◆ charntorune()

int charntorune ( Rune *  rune,
const char *  str,
int  length 
)

Definition at line 64 of file rune.c.

64  {
65  int c, c1, c2, c3;
66  long l;
67 
68  /* When we're not allowed to read anything */
69  if (length <= 0) {
70  goto badlen;
71  }
72 
73  /*
74  * one character sequence (7-bit value)
75  * 00000-0007F => T1
76  */
77  c = *(uchar *)str;
78  if (c < Tx) {
79  *rune = c;
80  return 1;
81  }
82 
83  // If we can't read more than one character we must stop
84  if (length <= 1) {
85  goto badlen;
86  }
87 
88  /*
89  * two character sequence (11-bit value)
90  * 0080-07FF => T2 Tx
91  */
92  c1 = *(uchar *)(str + 1) ^ Tx;
93  if (c1 & Testx)
94  goto bad;
95  if (c < T3) {
96  if (c < T2)
97  goto bad;
98  l = ((c << Bitx) | c1) & Rune2;
99  if (l <= Rune1)
100  goto bad;
101  *rune = l;
102  return 2;
103  }
104 
105  // If we can't read more than two characters we must stop
106  if (length <= 2) {
107  goto badlen;
108  }
109 
110  /*
111  * three character sequence (16-bit value)
112  * 0800-FFFF => T3 Tx Tx
113  */
114  c2 = *(uchar *)(str + 2) ^ Tx;
115  if (c2 & Testx)
116  goto bad;
117  if (c < T4) {
118  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
119  if (l <= Rune2)
120  goto bad;
121  *rune = l;
122  return 3;
123  }
124 
125  if (length <= 3)
126  goto badlen;
127 
128  /*
129  * four character sequence (21-bit value)
130  * 10000-1FFFFF => T4 Tx Tx Tx
131  */
132  c3 = *(uchar *)(str + 3) ^ Tx;
133  if (c3 & Testx)
134  goto bad;
135  if (c < T5) {
136  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
137  if (l <= Rune3)
138  goto bad;
139  if (l > Runemax)
140  goto bad;
141  *rune = l;
142  return 4;
143  }
144 
145  // Support for 5-byte or longer UTF-8 would go here, but
146  // since we don't have that, we'll just fall through to bad.
147 
148  /*
149  * bad decoding
150  */
151 bad:
152  *rune = Bad;
153  return 1;
154 badlen:
155  *rune = Bad;
156  return 0;
157 }
@ Runemax
Definition: utf.h:26
unsigned char uchar
Definition: utfdef.h:8

◆ chartorune()

int chartorune ( Rune *  rune,
const char *  str 
)

Definition at line 163 of file rune.c.

163  {
164  int c, c1, c2, c3;
165  long l;
166 
167  /*
168  * one character sequence
169  * 00000-0007F => T1
170  */
171  c = *(uchar *)str;
172  if (c < Tx) {
173  *rune = c;
174  return 1;
175  }
176 
177  /*
178  * two character sequence
179  * 0080-07FF => T2 Tx
180  */
181  c1 = *(uchar *)(str + 1) ^ Tx;
182  if (c1 & Testx)
183  goto bad;
184  if (c < T3) {
185  if (c < T2)
186  goto bad;
187  l = ((c << Bitx) | c1) & Rune2;
188  if (l <= Rune1)
189  goto bad;
190  *rune = l;
191  return 2;
192  }
193 
194  /*
195  * three character sequence
196  * 0800-FFFF => T3 Tx Tx
197  */
198  c2 = *(uchar *)(str + 2) ^ Tx;
199  if (c2 & Testx)
200  goto bad;
201  if (c < T4) {
202  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
203  if (l <= Rune2)
204  goto bad;
205  *rune = l;
206  return 3;
207  }
208 
209  /*
210  * four character sequence (21-bit value)
211  * 10000-1FFFFF => T4 Tx Tx Tx
212  */
213  c3 = *(uchar *)(str + 3) ^ Tx;
214  if (c3 & Testx)
215  goto bad;
216  if (c < T5) {
217  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
218  if (l <= Rune3)
219  goto bad;
220  if (l > Runemax)
221  goto bad;
222  *rune = l;
223  return 4;
224  }
225 
226  /*
227  * Support for 5-byte or longer UTF-8 would go here, but
228  * since we don't have that, we'll just fall through to bad.
229  */
230 
231  /*
232  * bad decoding
233  */
234 bad:
235  *rune = Bad;
236  return 1;
237 }

◆ fullrune()

int fullrune ( const char *  str,
int  n 
)

Definition at line 326 of file rune.c.

326  {
327  if (n > 0) {
328  int c = *(uchar *)str;
329  if (c < Tx)
330  return 1;
331  if (n > 1) {
332  if (c < T3)
333  return 1;
334  if (n > 2) {
335  if (c < T4 || n > 3)
336  return 1;
337  }
338  }
339  }
340  return 0;
341 }

◆ isvalidcharntorune()

int isvalidcharntorune ( const char *  str,
int  length,
Rune *  rune,
int *  consumed 
)

Definition at line 239 of file rune.c.

239  {
240  *consumed = charntorune(rune, str, length);
241  return *rune != Runeerror || *consumed == 3;
242 }
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64

◆ runelen()

int runelen ( Rune  rune)

Definition at line 299 of file rune.c.

299  {
300  char str[10];
301 
302  return runetochar(str, &rune);
303 }
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244

◆ runenlen()

int runenlen ( const Rune *  r,
int  nrune 
)

Definition at line 305 of file rune.c.

305  {
306  int nb;
307  ulong c; /* Rune is signed, so use unsigned for range check. */
308 
309  nb = 0;
310  while (nrune--) {
311  c = *r++;
312  if (c <= Rune1)
313  nb++;
314  else if (c <= Rune2)
315  nb += 2;
316  else if (c <= Rune3)
317  nb += 3;
318  else if (c <= Runemax)
319  nb += 4;
320  else
321  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
322  }
323  return nb;
324 }
unsigned long ulong
Definition: utfdef.h:11

◆ runetochar()

int runetochar ( char *  str,
const Rune *  rune 
)

Definition at line 244 of file rune.c.

244  {
245  /* Runes are signed, so convert to unsigned for range check. */
246  unsigned long c;
247 
248  /*
249  * one character sequence
250  * 00000-0007F => 00-7F
251  */
252  c = *rune;
253  if (c <= Rune1) {
254  str[0] = c;
255  return 1;
256  }
257 
258  /*
259  * two character sequence
260  * 0080-07FF => T2 Tx
261  */
262  if (c <= Rune2) {
263  str[0] = T2 | (c >> 1 * Bitx);
264  str[1] = Tx | (c & Maskx);
265  return 2;
266  }
267 
268  /*
269  * If the Rune is out of range, convert it to the error rune.
270  * Do this test here because the error rune encodes to three bytes.
271  * Doing it earlier would duplicate work, since an out of range
272  * Rune wouldn't have fit in one or two bytes.
273  */
274  if (c > Runemax)
275  c = Runeerror;
276 
277  /*
278  * three character sequence
279  * 0800-FFFF => T3 Tx Tx
280  */
281  if (c <= Rune3) {
282  str[0] = T3 | (c >> 2 * Bitx);
283  str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
284  str[2] = Tx | (c & Maskx);
285  return 3;
286  }
287 
288  /*
289  * four character sequence (21-bit value)
290  * 10000-1FFFFF => T4 Tx Tx Tx
291  */
292  str[0] = T4 | (c >> 3 * Bitx);
293  str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
294  str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
295  str[3] = Tx | (c & Maskx);
296  return 4;
297 }