tesseract  5.0.0
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License");
6 // you may not use this file except in compliance with the License.
7 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifdef HAVE_CONFIG_H
16 # include "config_auto.h"
17 #endif
18 
19 #include <cctype>
20 #include <climits> // for CHAR_BIT
21 #include <cmath>
22 #include <cstdarg>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstdio>
26 #include <cstring>
27 #include <limits> // for std::numeric_limits
28 
29 #include "scanutils.h"
30 
// Option bits gathered while a single % directive is being parsed.
enum Flags {
  FL_SPLAT = 1 << 0, // '*' seen: parse the value but do not assign it
  FL_INV = 1 << 1,   // Character-set with inverse
  FL_WIDTH = 1 << 2, // Field width specified
  FL_MINUS = 1 << 3, // Negative number
};
37 
// Integer "rank" selected by the length modifiers of a conversion.
// Each 'h' lowers the rank by one and each 'l' raises it by one, starting
// from RANK_INT, so the values must stay consecutive.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2, // Reached by "ll"; also used for 'L', 'q' and 'j'
  RANK_PTR = std::numeric_limits<int>::max() // Special value used for pointers
};
46 
47 const enum Ranks kMinRank = RANK_CHAR;
48 const enum Ranks kMaxRank = RANK_LONGLONG;
49 
50 const enum Ranks kIntMaxRank = RANK_LONGLONG;
51 const enum Ranks kSizeTRank = RANK_LONG;
52 const enum Ranks kPtrDiffRank = RANK_LONG;
53 
// Reason the scanner stopped before reaching the end of the format string.
enum Bail {
  BAIL_NONE = 0, // No error condition
  BAIL_EOF = 1,  // Hit EOF
  BAIL_ERR = 2   // Conversion mismatch
};
59 
60 // Helper functions ------------------------------------------------------------
// Returns the number of bits in one `long`, i.e. the width of one bitmap word.
inline size_t LongBit() {
  constexpr size_t kBitsPerLong = sizeof(long) * CHAR_BIT;
  return kBitsPerLong;
}
64 
// Consumes ASCII white space from `s` and returns the first character that is
// not white space (or EOF). That character is pushed back so the next read
// sees it again.
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isascii(next) && isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s); // Make sure next char is available for reading
  return next;
}
73 
74 static inline void SetBit(unsigned long *bitmap, unsigned int bit) {
75  bitmap[bit / LongBit()] |= 1UL << (bit % LongBit());
76 }
77 
78 static inline int TestBit(unsigned long *bitmap, unsigned int bit) {
79  return static_cast<int>(bitmap[bit / LongBit()] >> (bit % LongBit())) & 1;
80 }
81 
// Returns the numeric value of digit character `ch` in the given base, or -1
// if `ch` is not a valid digit.
//
// Decimal digits are accepted up to '9' when base >= 10 and up to '7'
// otherwise. The letters 'A'-'F' / 'a'-'f' are accepted only for base 16, so
// a letter can never produce a value that is >= the base.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7') {
      return ch - '0';
    }
  } else if (base == 16) {
    if (ch >= 'A' && ch <= 'F') {
      return ch - 'A' + 10;
    }
    if (ch >= 'a' && ch <= 'f') {
      return ch - 'a' + 10;
    }
  }
  return -1;
}
94 
95 // IO (re-)implementations -----------------------------------------------------
96 static uintmax_t streamtoumax(FILE *s, int base) {
97  int minus = 0;
98  uintmax_t v = 0;
99  int d, c = 0;
100 
101  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
102  ;
103  }
104 
105  // Single optional + or -
106  if (c == '-' || c == '+') {
107  minus = (c == '-');
108  c = fgetc(s);
109  }
110 
111  // Assign correct base
112  if (base == 0) {
113  if (c == '0') {
114  c = fgetc(s);
115  if (c == 'x' || c == 'X') {
116  base = 16;
117  c = fgetc(s);
118  } else {
119  base = 8;
120  }
121  }
122  } else if (base == 16) {
123  if (c == '0') {
124  c = fgetc(s);
125  if (c == 'x' || c == 'X') {
126  c = fgetc(s);
127  }
128  }
129  }
130 
131  // Actual number parsing
132  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) {
133  v = v * base + d;
134  }
135 
136  ungetc(c, s);
137  return minus ? -v : v;
138 }
139 
140 static double streamtofloat(FILE *s) {
141  bool minus = false;
142  uint64_t v = 0;
143  int d, c;
144  uint64_t k = 1;
145  uint64_t w = 0;
146 
147  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
148  ;
149  }
150 
151  // Single optional + or -
152  if (c == '-' || c == '+') {
153  minus = (c == '-');
154  c = fgetc(s);
155  }
156 
157  // Actual number parsing
158  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
159  v = v * 10 + d;
160  }
161  if (c == '.') {
162  for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
163  w = w * 10 + d;
164  k *= 10;
165  }
166  }
167  double f = v + static_cast<double>(w) / k;
168  if (c == 'e' || c == 'E') {
169  c = fgetc(s);
170  int expsign = 1;
171  if (c == '-' || c == '+') {
172  expsign = (c == '-') ? -1 : 1;
173  c = fgetc(s);
174  }
175  int exponent = 0;
176  for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
177  exponent = exponent * 10 + d;
178  }
179  exponent *= expsign;
180  f *= pow(10.0, static_cast<double>(exponent));
181  }
182  ungetc(c, s);
183 
184  return minus ? -f : f;
185 }
186 
static int tvfscanf(FILE *stream, const char *format, va_list ap);

// Variadic front end: packs the trailing arguments into a va_list, forwards
// everything to tvfscanf and returns its result unchanged.
int tfscanf(FILE *stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
199 
200 static int tvfscanf(FILE *stream, const char *format, va_list ap) {
201  const char *p = format;
202  char ch;
203  int q = 0;
204  uintmax_t val = 0;
205  int rank = RANK_INT; // Default rank
206  unsigned int width = UINT_MAX;
207  int base;
208  int flags = 0;
209  enum {
210  ST_NORMAL, // Ground state
211  ST_FLAGS, // Special flags
212  ST_WIDTH, // Field width
213  ST_MODIFIERS, // Length or conversion modifiers
214  ST_MATCH_INIT, // Initial state of %[ sequence
215  ST_MATCH, // Main state of %[ sequence
216  ST_MATCH_RANGE, // After - in a %[ sequence
217  } state = ST_NORMAL;
218  char *sarg = nullptr; // %s %c or %[ string argument
219  enum Bail bail = BAIL_NONE;
220  int converted = 0; // Successful conversions
221  unsigned long
222  matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))];
223  int matchinv = 0; // Is match map inverted?
224  unsigned char range_start = 0;
225  auto start_off = std::ftell(stream);
226 
227  // Skip leading spaces
228  SkipSpace(stream);
229 
230  while ((ch = *p++) && !bail) {
231  switch (state) {
232  case ST_NORMAL:
233  if (ch == '%') {
234  state = ST_FLAGS;
235  flags = 0;
236  rank = RANK_INT;
237  width = UINT_MAX;
238  } else if (isascii(ch) && isspace(ch)) {
239  SkipSpace(stream);
240  } else {
241  if (fgetc(stream) != ch) {
242  bail = BAIL_ERR; // Match failure
243  }
244  }
245  break;
246 
247  case ST_FLAGS:
248  if (ch == '*') {
249  flags |= FL_SPLAT;
250  } else if ('0' <= ch && ch <= '9') {
251  width = (ch - '0');
252  state = ST_WIDTH;
253  flags |= FL_WIDTH;
254  } else {
255  state = ST_MODIFIERS;
256  p--; // Process this character again
257  }
258  break;
259 
260  case ST_WIDTH:
261  if (ch >= '0' && ch <= '9') {
262  width = width * 10 + (ch - '0');
263  } else {
264  state = ST_MODIFIERS;
265  p--; // Process this character again
266  }
267  break;
268 
269  case ST_MODIFIERS:
270  switch (ch) {
271  // Length modifiers - nonterminal sequences
272  case 'h':
273  rank--; // Shorter rank
274  break;
275  case 'l':
276  rank++; // Longer rank
277  break;
278  case 'j':
279  rank = kIntMaxRank;
280  break;
281  case 'z':
282  rank = kSizeTRank;
283  break;
284  case 't':
285  rank = kPtrDiffRank;
286  break;
287  case 'L':
288  case 'q':
289  rank = RANK_LONGLONG; // long double/long long
290  break;
291 
292  default:
293  // Output modifiers - terminal sequences
294  state = ST_NORMAL; // Next state will be normal
295  if (rank < kMinRank) { // Canonicalize rank
296  rank = kMinRank;
297  } else if (rank > kMaxRank) {
298  rank = kMaxRank;
299  }
300 
301  switch (ch) {
302  case 'P': // Upper case pointer
303  case 'p': // Pointer
304  rank = RANK_PTR;
305  base = 0;
306  goto scan_int;
307 
308  case 'i': // Base-independent integer
309  base = 0;
310  goto scan_int;
311 
312  case 'd': // Decimal integer
313  base = 10;
314  goto scan_int;
315 
316  case 'o': // Octal integer
317  base = 8;
318  goto scan_int;
319 
320  case 'u': // Unsigned decimal integer
321  base = 10;
322  goto scan_int;
323 
324  case 'x': // Hexadecimal integer
325  case 'X':
326  base = 16;
327  goto scan_int;
328 
329  case 'n': // Number of characters consumed
330  val = std::ftell(stream) - start_off;
331  goto set_integer;
332 
333  scan_int:
334  q = SkipSpace(stream);
335  if (q <= 0) {
336  bail = BAIL_EOF;
337  break;
338  }
339  val = streamtoumax(stream, base);
340  // fall through
341 
342  set_integer:
343  if (!(flags & FL_SPLAT)) {
344  converted++;
345  switch (rank) {
346  case RANK_CHAR:
347  *va_arg(ap, unsigned char *) = static_cast<unsigned char>(val);
348  break;
349  case RANK_SHORT:
350  *va_arg(ap, unsigned short *) = static_cast<unsigned short>(val);
351  break;
352  case RANK_INT:
353  *va_arg(ap, unsigned int *) = static_cast<unsigned int>(val);
354  break;
355  case RANK_LONG:
356  *va_arg(ap, unsigned long *) = static_cast<unsigned long>(val);
357  break;
358  case RANK_LONGLONG:
359  *va_arg(ap, unsigned long long *) = static_cast<unsigned long long>(val);
360  break;
361  case RANK_PTR:
362  *va_arg(ap, void **) = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
363  break;
364  }
365  }
366  break;
367 
368  case 'f': // Preliminary float value parsing
369  case 'g':
370  case 'G':
371  case 'e':
372  case 'E':
373  q = SkipSpace(stream);
374  if (q <= 0) {
375  bail = BAIL_EOF;
376  break;
377  }
378 
379  {
380  double fval = streamtofloat(stream);
381  if (!(flags & FL_SPLAT)) {
382  if (rank == RANK_INT) {
383  *va_arg(ap, float *) = static_cast<float>(fval);
384  } else if (rank == RANK_LONG) {
385  *va_arg(ap, double *) = static_cast<double>(fval);
386  }
387  converted++;
388  }
389  }
390  break;
391 
392  case 'c': // Character
393  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
394  sarg = va_arg(ap, char *);
395  while (width--) {
396  if ((q = fgetc(stream)) <= 0) {
397  bail = BAIL_EOF;
398  break;
399  }
400  if (!(flags & FL_SPLAT)) {
401  *sarg++ = q;
402  converted++;
403  }
404  }
405  break;
406 
407  case 's': // String
408  {
409  if (!(flags & FL_SPLAT)) {
410  sarg = va_arg(ap, char *);
411  }
412  unsigned length = 0;
413  while (width--) {
414  q = fgetc(stream);
415  if ((isascii(q) && isspace(q)) || (q <= 0)) {
416  ungetc(q, stream);
417  break;
418  }
419  if (!(flags & FL_SPLAT)) {
420  sarg[length] = q;
421  }
422  length++;
423  }
424  if (length == 0) {
425  bail = BAIL_EOF;
426  } else if (!(flags & FL_SPLAT)) {
427  sarg[length] = '\0'; // Terminate output
428  converted++;
429  }
430  } break;
431 
432  case '[': // Character range
433  sarg = va_arg(ap, char *);
434  state = ST_MATCH_INIT;
435  matchinv = 0;
436  memset(matchmap, 0, sizeof matchmap);
437  break;
438 
439  case '%': // %% sequence
440  if (fgetc(stream) != '%') {
441  bail = BAIL_ERR;
442  }
443  break;
444 
445  default: // Anything else
446  bail = BAIL_ERR; // Unknown sequence
447  break;
448  }
449  }
450  break;
451 
452  case ST_MATCH_INIT: // Initial state for %[ match
453  if (ch == '^' && !(flags & FL_INV)) {
454  matchinv = 1;
455  } else {
456  SetBit(matchmap, static_cast<unsigned char>(ch));
457  state = ST_MATCH;
458  }
459  break;
460 
461  case ST_MATCH: // Main state for %[ match
462  if (ch == ']') {
463  goto match_run;
464  } else if (ch == '-') {
465  range_start = static_cast<unsigned char>(ch);
466  state = ST_MATCH_RANGE;
467  } else {
468  SetBit(matchmap, static_cast<unsigned char>(ch));
469  }
470  break;
471 
472  case ST_MATCH_RANGE: // %[ match after -
473  if (ch == ']') {
474  SetBit(matchmap, static_cast<unsigned char>('-'));
475  goto match_run;
476  } else {
477  int i;
478  for (i = range_start; i < (static_cast<unsigned char>(ch)); i++) {
479  SetBit(matchmap, i);
480  }
481  state = ST_MATCH;
482  }
483  break;
484 
485  match_run: // Match expression finished
486  char *oarg = sarg;
487  while (width) {
488  q = fgetc(stream);
489  auto qc = static_cast<unsigned char>(q);
490  if (q <= 0 || !(TestBit(matchmap, qc) ^ matchinv)) {
491  ungetc(q, stream);
492  break;
493  }
494  if (!(flags & FL_SPLAT)) {
495  *sarg = q;
496  }
497  sarg++;
498  }
499  if (oarg == sarg) {
500  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
501  } else if (!(flags & FL_SPLAT)) {
502  *sarg = '\0';
503  converted++;
504  }
505  break;
506  }
507  }
508 
509  if (bail == BAIL_EOF && !converted) {
510  converted = -1; // Return EOF (-1)
511  }
512 
513  return converted;
514 }
Ranks
Definition: scanutils.cpp:38
@ RANK_LONGLONG
Definition: scanutils.cpp:43
@ RANK_SHORT
Definition: scanutils.cpp:40
@ RANK_CHAR
Definition: scanutils.cpp:39
@ RANK_LONG
Definition: scanutils.cpp:42
@ RANK_INT
Definition: scanutils.cpp:41
@ RANK_PTR
Definition: scanutils.cpp:44
enum Ranks kIntMaxRank
Definition: scanutils.cpp:50
enum Ranks kSizeTRank
Definition: scanutils.cpp:51
enum Ranks kMinRank
Definition: scanutils.cpp:47
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:52
Bail
Definition: scanutils.cpp:54
@ BAIL_NONE
Definition: scanutils.cpp:55
@ BAIL_ERR
Definition: scanutils.cpp:57
@ BAIL_EOF
Definition: scanutils.cpp:56
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:189
Flags
Definition: scanutils.cpp:31
@ FL_SPLAT
Definition: scanutils.cpp:32
@ FL_MINUS
Definition: scanutils.cpp:35
@ FL_INV
Definition: scanutils.cpp:33
@ FL_WIDTH
Definition: scanutils.cpp:34
enum Ranks kMaxRank
Definition: scanutils.cpp:48
size_t LongBit()
Definition: scanutils.cpp:61