tesseract  5.0.0
unicodetext.h
Go to the documentation of this file.
1 
17 #ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
18 #define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
19 
20 #include <stddef.h> // for NULL, ptrdiff_t
21 #include <iterator> // for bidirectional_iterator_tag, etc
22 #include <string> // for string
23 #include <utility> // for pair
24 
25 #include "syntaxnet/base.h"
26 
27 // ***************************** UnicodeText **************************
28 //
29 // A UnicodeText object is a container for a sequence of Unicode
30 // codepoint values. It has default, copy, and assignment constructors.
31 // Data can be appended to it from another UnicodeText, from
32 // iterators, or from a single codepoint.
33 //
34 // The internal representation of the text is UTF-8. Since UTF-8 is a
35 // variable-width format, UnicodeText does not provide random access
36 // to the text, and changes to the text are permitted only at the end.
37 //
38 // The UnicodeText class defines a const_iterator. The dereferencing
39 // operator (*) returns a codepoint (char32). The iterator is a
40 // bidirectional, read-only iterator. It becomes invalid if the text
41 // is changed.
42 //
43 // There are methods for appending and retrieving UTF-8 data directly.
44 // The 'utf8_data' method returns a const char* that contains the
45 // UTF-8-encoded version of the text; 'utf8_length' returns the number
46 // of bytes in the UTF-8 data. An iterator's 'get' method stores up to
47 // 4 bytes of UTF-8 data in a char array and returns the number of
48 // bytes that it stored.
49 //
50 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
51 // 0x10FFFF], but UnicodeText has the additional restriction that it
52 // can contain only those characters that are valid for interchange on
53 // the Web. This excludes all of the control codes except for carriage
54 // return, line feed, and horizontal tab. It also excludes
55 // non-characters, but codepoints that are in the Private Use regions
56 // are allowed, as are codepoints that are unassigned. (See the
57 // Unicode reference for details.) The function UniLib::IsInterchangeValid
58 // can be used as a test for this property.
59 //
60 // UnicodeTexts are safe. Every method that constructs or modifies a
61 // UnicodeText tests for interchange-validity, and will substitute a
62 // space for the invalid data. Such cases are reported via
63 // LOG(WARNING).
64 //
65 // MEMORY MANAGEMENT: copy, take ownership, or point to
66 //
67 // A UnicodeText is either an "owner", meaning that it owns the memory
68 // for the data buffer and will free it when the UnicodeText is
69 // destroyed, or it is an "alias", meaning that it does not.
70 //
71 // There are three methods for storing UTF-8 data in a UnicodeText:
72 //
73 // CopyUTF8(buffer, len) copies buffer.
74 //
75 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
76 //
77 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
78 //
79 // All three methods perform a validity check on the buffer. There are
80 // private, "unsafe" versions of these functions that bypass the
81 // validity check. They are used internally and by friend-functions
82 // that are handling UTF-8 data that has already been validated.
83 //
84 // The purpose of an alias is to avoid making an unnecessary copy of a
85 // UTF-8 buffer while still providing access to the Unicode values
86 // within that text through iterators or the fast scanners that are
87 // based on UTF-8 state tables. The lifetime of an alias must not
88 // exceed the lifetime of the buffer from which it was constructed.
89 //
90 // The semantics of an alias might be described as "copy on write or
91 // repair." The source data is never modified. If push_back() or
92 // append() is called on an alias, a copy of the data will be created,
93 // and the UnicodeText will become an owner. If clear() is called on
94 // an alias, it becomes an (empty) owner.
95 //
96 // The copy constructor and the assignment operator produce an owner.
97 // That is, after direct initialization ("UnicodeText x(y);") or copy
98 // initialization ("UnicodeText x = y;") x will be an owner, even if y
99 // was an alias. The assignment operator ("x = y;") also produces an
100 // owner unless x and y are the same object and y is an alias.
101 //
102 // Aliases should be used with care. If the source from which an alias
103 // was created is freed, or if the contents are changed, while the
104 // alias is still in use, fatal errors could result. But it can be
105 // quite useful to have a UnicodeText "window" through which to see a
106 // UTF-8 buffer without having to pay the price of making a copy.
107 //
108 // UTILITIES
109 //
110 // The interfaces in util/utf8/public/textutils.h provide higher-level
111 // utilities for dealing with UnicodeTexts, including routines for
112 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
113 // strings, creating strings from UnicodeTexts, normalizing text for
114 // efficient matching or display, and others.
115 
116 class UnicodeText {
117 public:
118  class const_iterator;
119 
121 
122  // Constructors. These always produce owners.
123  UnicodeText(); // Create an empty text.
124  UnicodeText(const UnicodeText &src); // copy constructor
125  // Construct a substring (copies the data).
126  UnicodeText(const const_iterator &first, const const_iterator &last);
127 
128  // Assignment operator. This copies the data and produces an owner
129  // unless this == &src, e.g., "x = x;", which is a no-op.
130  UnicodeText &operator=(const UnicodeText &src);
131 
132  // x.Copy(y) copies the data from y into x.
133  UnicodeText &Copy(const UnicodeText &src);
134  inline UnicodeText &assign(const UnicodeText &src) {
135  return Copy(src);
136  }
137 
138  // x.PointTo(y) changes x so that it points to y's data.
139  // It does not copy y or take ownership of y's data.
140  UnicodeText &PointTo(const UnicodeText &src);
141  UnicodeText &PointTo(const const_iterator &first, const const_iterator &last);
142 
143  ~UnicodeText();
144 
145  void clear(); // Clear text.
146  bool empty() const {
147  return repr_.size_ == 0;
148  } // Test if text is empty.
149 
150  // Add a codepoint to the end of the text.
151  // If the codepoint is not interchange-valid, add a space instead
152  // and log a warning.
153  void push_back(char32 codepoint);
154 
155  // Generic appending operation.
156  // iterator_traits<ForwardIterator>::value_type must be implicitly
157  // convertible to char32. Typical uses of this method might include:
158  // char32 chars[] = {0x1, 0x2, ...};
159  // vector<char32> more_chars = ...;
160  // utext.append(chars, chars+arraysize(chars));
161  // utext.append(more_chars.begin(), more_chars.end());
162  template <typename ForwardIterator>
163  UnicodeText &append(ForwardIterator first, const ForwardIterator last) {
164  while (first != last) {
165  push_back(*first++);
166  }
167  return *this;
168  }
169 
170  // A specialization of the generic append() method.
171  UnicodeText &append(const const_iterator &first, const const_iterator &last);
172 
173  // An optimization of append(source.begin(), source.end()).
174  UnicodeText &append(const UnicodeText &source);
175 
176  int size() const; // the number of Unicode characters (codepoints)
177 
178  friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
179  friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs);
180 
182  typedef const_iterator CI;
183 
184  public:
185  typedef std::bidirectional_iterator_tag iterator_category;
187  typedef ptrdiff_t difference_type;
188  typedef void pointer; // (Not needed.)
189  typedef const char32 reference; // (Needed for const_reverse_iterator)
190 
191  // Iterators are default-constructible.
192  const_iterator();
193 
194  // It's safe to make multiple passes over a UnicodeText.
195  const_iterator(const const_iterator &other);
196  const_iterator &operator=(const const_iterator &other);
197 
198  char32 operator*() const; // Dereference
199 
200  const_iterator &operator++(); // Advance (++iter)
201  const_iterator operator++(int) { // (iter++)
202  const_iterator result(*this);
203  ++*this;
204  return result;
205  }
206 
207  const_iterator &operator--(); // Retreat (--iter)
208  const_iterator operator--(int) { // (iter--)
209  const_iterator result(*this);
210  --*this;
211  return result;
212  }
213 
214  // We love relational operators.
215  friend bool operator==(const CI &lhs, const CI &rhs) {
216  return lhs.it_ == rhs.it_;
217  }
218  friend bool operator!=(const CI &lhs, const CI &rhs) {
219  return !(lhs == rhs);
220  }
221  friend bool operator<(const CI &lhs, const CI &rhs);
222  friend bool operator>(const CI &lhs, const CI &rhs) {
223  return rhs < lhs;
224  }
225  friend bool operator<=(const CI &lhs, const CI &rhs) {
226  return !(rhs < lhs);
227  }
228  friend bool operator>=(const CI &lhs, const CI &rhs) {
229  return !(lhs < rhs);
230  }
231 
232  friend difference_type distance(const CI &first, const CI &last);
233 
234  // UTF-8-specific methods
235  // Store the UTF-8 encoding of the current codepoint into buf,
236  // which must be at least 4 bytes long. Return the number of
237  // bytes written.
238  int get_utf8(char *buf) const;
239  // Return the UTF-8 character that the iterator points to.
240  string get_utf8_string() const;
241  // Return the byte length of the UTF-8 character the iterator points to.
242  int utf8_length() const;
243  // Return the iterator's pointer into the UTF-8 data.
244  const char *utf8_data() const {
245  return it_;
246  }
247 
248  string DebugString() const;
249 
250  private:
251  friend class UnicodeText;
252  friend class UnicodeTextUtils;
254  explicit const_iterator(const char *it) : it_(it) {}
255 
256  const char *it_;
257  };
258 
259  const_iterator begin() const;
260  const_iterator end() const;
261 
262  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
263  public:
265  : std::reverse_iterator<const_iterator>(it) {}
266  const char *utf8_data() const {
267  const_iterator tmp_it = base();
268  return (--tmp_it).utf8_data();
269  }
270  int get_utf8(char *buf) const {
271  const_iterator tmp_it = base();
272  return (--tmp_it).get_utf8(buf);
273  }
274  string get_utf8_string() const {
275  const_iterator tmp_it = base();
276  return (--tmp_it).get_utf8_string();
277  }
278  int utf8_length() const {
279  const_iterator tmp_it = base();
280  return (--tmp_it).utf8_length();
281  }
282  };
284  return const_reverse_iterator(end());
285  }
287  return const_reverse_iterator(begin());
288  }
289 
290  // Substring searching. Returns the beginning of the first
291  // occurrence of "look", or end() if not found.
292  const_iterator find(const UnicodeText &look, const_iterator start_pos) const;
293  // Equivalent to find(look, begin())
294  const_iterator find(const UnicodeText &look) const;
295 
296  // Returns whether this contains the character U+FFFD. This can
297  // occur, for example, if the input to Encodings::Decode() had byte
298  // sequences that were invalid in the source encoding.
299  bool HasReplacementChar() const;
300 
301  // UTF-8-specific methods
302  //
303  // Return the data, length, and capacity of UTF-8-encoded version of
304  // the text. Length and capacity are measured in bytes.
305  const char *utf8_data() const {
306  return repr_.data_;
307  }
308  int utf8_length() const {
309  return repr_.size_;
310  }
311  int utf8_capacity() const {
312  return repr_.capacity_;
313  }
314 
315  // Return the UTF-8 data as a string.
316  static string UTF8Substring(const const_iterator &first, const const_iterator &last);
317 
318  // There are three methods for initializing a UnicodeText from UTF-8
319  // data. They vary in details of memory management. In all cases,
320  // the data is tested for interchange-validity. If it is not
321  // interchange-valid, a LOG(WARNING) is issued, and each
322  // structurally invalid byte and each interchange-invalid codepoint
323  // is replaced with a space.
324 
325  // x.CopyUTF8(buf, len) copies buf into x.
326  UnicodeText &CopyUTF8(const char *utf8_buffer, int byte_length);
327 
328  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
329  // buf. buf is not copied.
330  UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
331 
332  // x.PointToUTF8(buf,len) changes x so that it points to buf
333  // ("becomes an alias"). It does not take ownership or copy buf.
334  // If the buffer is not valid, this has the same effect as
335  // CopyUTF8(utf8_buffer, byte_length).
336  UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);
337 
338  // Occasionally it is necessary to use functions that operate on the
339  // pointer returned by utf8_data(). MakeIterator(p) provides a way
340  // to get back to the UnicodeText level. It uses CHECK to ensure
341  // that p is a pointer within this object's UTF-8 data, and that it
342  // points to the beginning of a character.
343  const_iterator MakeIterator(const char *p) const;
344 
345  string DebugString() const;
346 
347 private:
348  friend class const_iterator;
349  friend class UnicodeTextUtils;
350 
351  class Repr { // A byte-string.
352  public:
353  char *data_;
354  int size_;
355  int capacity_;
356  bool ours_; // Do we own data_?
357 
358  Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
359  ~Repr() {
360  if (ours_)
361  delete[] data_;
362  }
363 
364  void clear();
365  void reserve(int capacity);
366  void resize(int size);
367 
368  void append(const char *bytes, int byte_length);
369  void Copy(const char *data, int size);
370  void TakeOwnershipOf(char *data, int size, int capacity);
371  void PointTo(const char *data, int size);
372 
373  string DebugString() const;
374 
375  private:
376  Repr &operator=(const Repr &);
377  Repr(const Repr &other);
378  };
379 
380  Repr repr_;
381 
382  // UTF-8-specific private methods.
383  // These routines do not perform a validity check when compiled
384  // in opt mode.
385  // It is an error to call these methods with UTF-8 data that
386  // is not interchange-valid.
387  //
388  UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length);
389  UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
390  UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length);
391  UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length);
392  const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const;
393 };
394 
395 bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
396 
397 inline bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs) {
398  return !(lhs == rhs);
399 }
400 
401 // UnicodeTextRange is a pair of iterators, useful for specifying text
402 // segments. If the iterators are ==, the segment is empty.
403 typedef pair<UnicodeText::const_iterator, UnicodeText::const_iterator> UnicodeTextRange;
404 
406  return r.first == r.second;
407 }
408 
409 // *************************** Utilities *************************
410 
411 // A factory function for creating a UnicodeText from a buffer of
412 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
413 // is an "owner.")
414 //
415 // Each byte that is structurally invalid will be replaced with a
416 // space. Each codepoint that is interchange-invalid will also be
417 // replaced with a space, even if the codepoint was represented with a
418 // multibyte sequence in the UTF-8 data.
419 //
420 inline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length,
421  int byte_capacity) {
422  return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
423 }
424 
425 // A factory function for creating a UnicodeText from a buffer of
426 // UTF-8 data. The new UnicodeText does not take ownership of the
427 // buffer. (It is an "alias.")
428 //
430  int byte_length) {
431  return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
432 }
433 
434 // Create a UnicodeText from a UTF-8 string or buffer.
435 //
436 // If do_copy is true, then a copy of the string is made. The copy is
437 // owned by the resulting UnicodeText object and will be freed when
438 // the object is destroyed. This UnicodeText object is referred to
439 // as an "owner."
440 //
441 // If do_copy is false, then no copy is made. The resulting
442 // UnicodeText object does NOT take ownership of the string; in this
443 // case, the lifetime of the UnicodeText object must not exceed the
444 // lifetime of the string. This Unicodetext object is referred to as
445 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
446 //
447 // If the input string does not contain valid UTF-8, then a copy is
448 // made (as if do_copy were true) and coerced to valid UTF-8 by
449 // replacing each invalid byte with a space.
450 //
451 inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) {
452  UnicodeText t;
453  if (do_copy) {
454  t.CopyUTF8(utf8_buf, len);
455  } else {
456  t.PointToUTF8(utf8_buf, len);
457  }
458  return t;
459 }
460 
461 inline UnicodeText UTF8ToUnicodeText(const string &utf_string, bool do_copy) {
462  return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
463 }
464 
465 inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len) {
466  return UTF8ToUnicodeText(utf8_buf, len, true);
467 }
468 inline UnicodeText UTF8ToUnicodeText(const string &utf8_string) {
469  return UTF8ToUnicodeText(utf8_string, true);
470 }
471 
472 // Return a string containing the UTF-8 encoded version of all the
473 // Unicode characters in t.
474 inline string UnicodeTextToUTF8(const UnicodeText &t) {
475  return string(t.utf8_data(), t.utf8_length());
476 }
477 
// arraysize(a) evaluates to the number of elements in the built-in array a.
// ArraySizeHelper is declared to return a reference to a char array with the
// same element count as its argument, so sizeof on a call expression yields
// that count. The function is never called and needs no definition; only
// its type is used.
template <typename Element, size_t Count>
char (&ArraySizeHelper(Element (&)[Count]))[Count];
#define arraysize(arr) (sizeof(ArraySizeHelper(arr)))
484 
485 // For debugging. Return a string of integers, written in uppercase
486 // hex (%X), corresponding to the codepoints within the text. Each
487 // integer is followed by a space. E.g., "61 62 6A 3005 ".
488 string CodepointString(const UnicodeText &t);
489 
490 #endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
signed int char32
char(& ArraySizeHelper(T(&array)[N]))[N]
UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy)
Definition: unicodetext.h:451
pair< UnicodeText::const_iterator, UnicodeText::const_iterator > UnicodeTextRange
Definition: unicodetext.h:403
string CodepointString(const UnicodeText &t)
UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.h:420
UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer, int byte_length)
Definition: unicodetext.h:429
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
string UnicodeTextToUTF8(const UnicodeText &t)
Definition: unicodetext.h:474
bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r)
Definition: unicodetext.h:405
bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.h:397
LIST last(LIST var_list)
Definition: oldlist.cpp:153
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:202
void push_back(char32 codepoint)
Definition: unicodetext.cc:357
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:221
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:301
char32 value_type
Definition: unicodetext.h:118
friend class UnicodeTextUtils
Definition: unicodetext.h:349
const char * utf8_data() const
Definition: unicodetext.h:305
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216
const_reverse_iterator rend() const
Definition: unicodetext.h:286
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:270
bool empty() const
Definition: unicodetext.h:146
string DebugString() const
int utf8_capacity() const
Definition: unicodetext.h:311
UnicodeText & assign(const UnicodeText &src)
Definition: unicodetext.h:134
friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:163
const_iterator end() const
Definition: unicodetext.cc:412
friend class const_iterator
Definition: unicodetext.h:348
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:209
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:237
int utf8_length() const
Definition: unicodetext.h:308
int size() const
Definition: unicodetext.cc:373
bool HasReplacementChar() const
const_iterator begin() const
Definition: unicodetext.cc:408
friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.h:397
const_reverse_iterator rbegin() const
Definition: unicodetext.h:283
void clear()
Definition: unicodetext.cc:350
friend bool operator>(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:222
friend bool operator<=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:225
const_iterator operator++(int)
Definition: unicodetext.h:201
friend class UTF8StateTableProperty
Definition: unicodetext.h:253
const_iterator operator--(int)
Definition: unicodetext.h:208
friend difference_type distance(const CI &first, const CI &last)
Definition: unicodetext.cc:44
std::bidirectional_iterator_tag iterator_category
Definition: unicodetext.h:185
const char * utf8_data() const
Definition: unicodetext.h:244
const_iterator & operator++()
Definition: unicodetext.cc:443
const_iterator & operator--()
Definition: unicodetext.cc:448
int get_utf8(char *buf) const
Definition: unicodetext.cc:454
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402
friend bool operator>=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:228
string get_utf8_string() const
Definition: unicodetext.cc:468
friend bool operator!=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:218
friend bool operator<(const CI &lhs, const CI &rhs)
Definition: unicodetext.cc:416
friend bool operator==(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:215
const_reverse_iterator(const_iterator it)
Definition: unicodetext.h:264
const char * utf8_data() const
Definition: unicodetext.h:266