tesseract  5.0.0
rune.c
Go to the documentation of this file.
1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  * Copyright (c) 2002 by Lucent Technologies.
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose without fee is hereby granted, provided that this entire notice
6  * is included in all copies of any software which is or includes a copy
7  * or modification of this software and in all copies of the supporting
8  * documentation for such software.
9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10  * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13  */
14 #include <stdarg.h>
15 #include <string.h>
16 #include "third_party/utf/utf.h"
17 #include "third_party/utf/utfdef.h"
18 
19 enum {
20  Bit1 = 7,
21  Bitx = 6,
22  Bit2 = 5,
23  Bit3 = 4,
24  Bit4 = 3,
25  Bit5 = 2,
26 
27  T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */
28  Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */
29  T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */
30  T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */
31  T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */
32  T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */
33 
34  Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */
35  Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */
36  Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */
37  Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1,
38  /* 0001 1111 1111 1111 1111 1111 */
39 
40  Maskx = (1 << Bitx) - 1, /* 0011 1111 */
41  Testx = Maskx ^ 0xFF, /* 1100 0000 */
42 
44 };
45 
46 /*
47  * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
48  * This is a slower but "safe" version of the old chartorune
49  * that works on strings that are not necessarily null-terminated.
50  *
51  * If you know for sure that your string is null-terminated,
52  * chartorune will be a bit faster.
53  *
54  * It is guaranteed not to attempt to access "length"
55  * past the incoming pointer. This is to avoid
56  * possible access violations. If the string appears to be
57  * well-formed but incomplete (i.e., to get the whole Rune
58  * we'd need to read past str+length) then we'll set the Rune
59  * to Bad and return 0.
60  *
61  * Note that if we have decoding problems for other
62  * reasons, we return 1 instead of 0.
63  */
64 int charntorune(Rune *rune, const char *str, int length) {
65  int c, c1, c2, c3;
66  long l;
67 
68  /* When we're not allowed to read anything */
69  if (length <= 0) {
70  goto badlen;
71  }
72 
73  /*
74  * one character sequence (7-bit value)
75  * 00000-0007F => T1
76  */
77  c = *(uchar *)str;
78  if (c < Tx) {
79  *rune = c;
80  return 1;
81  }
82 
83  // If we can't read more than one character we must stop
84  if (length <= 1) {
85  goto badlen;
86  }
87 
88  /*
89  * two character sequence (11-bit value)
90  * 0080-07FF => T2 Tx
91  */
92  c1 = *(uchar *)(str + 1) ^ Tx;
93  if (c1 & Testx)
94  goto bad;
95  if (c < T3) {
96  if (c < T2)
97  goto bad;
98  l = ((c << Bitx) | c1) & Rune2;
99  if (l <= Rune1)
100  goto bad;
101  *rune = l;
102  return 2;
103  }
104 
105  // If we can't read more than two characters we must stop
106  if (length <= 2) {
107  goto badlen;
108  }
109 
110  /*
111  * three character sequence (16-bit value)
112  * 0800-FFFF => T3 Tx Tx
113  */
114  c2 = *(uchar *)(str + 2) ^ Tx;
115  if (c2 & Testx)
116  goto bad;
117  if (c < T4) {
118  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
119  if (l <= Rune2)
120  goto bad;
121  *rune = l;
122  return 3;
123  }
124 
125  if (length <= 3)
126  goto badlen;
127 
128  /*
129  * four character sequence (21-bit value)
130  * 10000-1FFFFF => T4 Tx Tx Tx
131  */
132  c3 = *(uchar *)(str + 3) ^ Tx;
133  if (c3 & Testx)
134  goto bad;
135  if (c < T5) {
136  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
137  if (l <= Rune3)
138  goto bad;
139  if (l > Runemax)
140  goto bad;
141  *rune = l;
142  return 4;
143  }
144 
145  // Support for 5-byte or longer UTF-8 would go here, but
146  // since we don't have that, we'll just fall through to bad.
147 
148  /*
149  * bad decoding
150  */
151 bad:
152  *rune = Bad;
153  return 1;
154 badlen:
155  *rune = Bad;
156  return 0;
157 }
158 
159 /*
160  * This is the older "unsafe" version, which works fine on
161  * null-terminated strings.
162  */
163 int chartorune(Rune *rune, const char *str) {
164  int c, c1, c2, c3;
165  long l;
166 
167  /*
168  * one character sequence
169  * 00000-0007F => T1
170  */
171  c = *(uchar *)str;
172  if (c < Tx) {
173  *rune = c;
174  return 1;
175  }
176 
177  /*
178  * two character sequence
179  * 0080-07FF => T2 Tx
180  */
181  c1 = *(uchar *)(str + 1) ^ Tx;
182  if (c1 & Testx)
183  goto bad;
184  if (c < T3) {
185  if (c < T2)
186  goto bad;
187  l = ((c << Bitx) | c1) & Rune2;
188  if (l <= Rune1)
189  goto bad;
190  *rune = l;
191  return 2;
192  }
193 
194  /*
195  * three character sequence
196  * 0800-FFFF => T3 Tx Tx
197  */
198  c2 = *(uchar *)(str + 2) ^ Tx;
199  if (c2 & Testx)
200  goto bad;
201  if (c < T4) {
202  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
203  if (l <= Rune2)
204  goto bad;
205  *rune = l;
206  return 3;
207  }
208 
209  /*
210  * four character sequence (21-bit value)
211  * 10000-1FFFFF => T4 Tx Tx Tx
212  */
213  c3 = *(uchar *)(str + 3) ^ Tx;
214  if (c3 & Testx)
215  goto bad;
216  if (c < T5) {
217  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
218  if (l <= Rune3)
219  goto bad;
220  if (l > Runemax)
221  goto bad;
222  *rune = l;
223  return 4;
224  }
225 
226  /*
227  * Support for 5-byte or longer UTF-8 would go here, but
228  * since we don't have that, we'll just fall through to bad.
229  */
230 
231  /*
232  * bad decoding
233  */
234 bad:
235  *rune = Bad;
236  return 1;
237 }
238 
239 int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed) {
240  *consumed = charntorune(rune, str, length);
241  return *rune != Runeerror || *consumed == 3;
242 }
243 
244 int runetochar(char *str, const Rune *rune) {
245  /* Runes are signed, so convert to unsigned for range check. */
246  unsigned long c;
247 
248  /*
249  * one character sequence
250  * 00000-0007F => 00-7F
251  */
252  c = *rune;
253  if (c <= Rune1) {
254  str[0] = c;
255  return 1;
256  }
257 
258  /*
259  * two character sequence
260  * 0080-07FF => T2 Tx
261  */
262  if (c <= Rune2) {
263  str[0] = T2 | (c >> 1 * Bitx);
264  str[1] = Tx | (c & Maskx);
265  return 2;
266  }
267 
268  /*
269  * If the Rune is out of range, convert it to the error rune.
270  * Do this test here because the error rune encodes to three bytes.
271  * Doing it earlier would duplicate work, since an out of range
272  * Rune wouldn't have fit in one or two bytes.
273  */
274  if (c > Runemax)
275  c = Runeerror;
276 
277  /*
278  * three character sequence
279  * 0800-FFFF => T3 Tx Tx
280  */
281  if (c <= Rune3) {
282  str[0] = T3 | (c >> 2 * Bitx);
283  str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
284  str[2] = Tx | (c & Maskx);
285  return 3;
286  }
287 
288  /*
289  * four character sequence (21-bit value)
290  * 10000-1FFFFF => T4 Tx Tx Tx
291  */
292  str[0] = T4 | (c >> 3 * Bitx);
293  str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
294  str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
295  str[3] = Tx | (c & Maskx);
296  return 4;
297 }
298 
299 int runelen(Rune rune) {
300  char str[10];
301 
302  return runetochar(str, &rune);
303 }
304 
305 int runenlen(const Rune *r, int nrune) {
306  int nb;
307  ulong c; /* Rune is signed, so use unsigned for range check. */
308 
309  nb = 0;
310  while (nrune--) {
311  c = *r++;
312  if (c <= Rune1)
313  nb++;
314  else if (c <= Rune2)
315  nb += 2;
316  else if (c <= Rune3)
317  nb += 3;
318  else if (c <= Runemax)
319  nb += 4;
320  else
321  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
322  }
323  return nb;
324 }
325 
326 int fullrune(const char *str, int n) {
327  if (n > 0) {
328  int c = *(uchar *)str;
329  if (c < Tx)
330  return 1;
331  if (n > 1) {
332  if (c < T3)
333  return 1;
334  if (n > 2) {
335  if (c < T4 || n > 3)
336  return 1;
337  }
338  }
339  }
340  return 0;
341 }
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64
int chartorune(Rune *rune, const char *str)
Definition: rune.c:163
int fullrune(const char *str, int n)
Definition: rune.c:326
@ T4
Definition: rune.c:31
@ Testx
Definition: rune.c:41
@ T3
Definition: rune.c:30
@ T1
Definition: rune.c:27
@ Rune4
Definition: rune.c:37
@ Maskx
Definition: rune.c:40
@ Rune3
Definition: rune.c:36
@ Bit4
Definition: rune.c:24
@ T5
Definition: rune.c:32
@ Bit5
Definition: rune.c:25
@ T2
Definition: rune.c:29
@ Bit2
Definition: rune.c:22
@ Rune2
Definition: rune.c:35
@ Rune1
Definition: rune.c:34
@ Tx
Definition: rune.c:28
@ Bit3
Definition: rune.c:23
@ Bad
Definition: rune.c:43
@ Bitx
Definition: rune.c:21
@ Bit1
Definition: rune.c:20
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:239
int runelen(Rune rune)
Definition: rune.c:299
int runenlen(const Rune *r, int nrune)
Definition: rune.c:305
signed int Rune
Definition: utf.h:19
@ Runemax
Definition: utf.h:26
@ Runeerror
Definition: utf.h:25
unsigned char uchar
Definition: utfdef.h:8
unsigned long ulong
Definition: utfdef.h:11