tesseract  5.0.0
unicharmap.h
Go to the documentation of this file.
1 // File: unicharmap.h
3 // Description: Unicode character/ligature to integer id class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_UNICHARMAP_H_
21 #define TESSERACT_CCUTIL_UNICHARMAP_H_
22 
23 #include <tesseract/unichar.h>
24 
25 namespace tesseract {
26 
27 // A UNICHARMAP stores unique unichars. Each of them is associated with one
28 // UNICHAR_ID.
29 class TESS_API UNICHARMAP {
30 public:
31  // Create an empty UNICHARMAP
32  UNICHARMAP();
33 
34  ~UNICHARMAP();
35 
36  // Insert the given unichar representation in the UNICHARMAP and associate it
37  // with the given id. The length of the representation MUST be non-zero.
38  void insert(const char *const unichar_repr, UNICHAR_ID id);
39 
40  // Return the id associated with the given unichar representation,
41  // this representation MUST exist within the UNICHARMAP. The first
42  // length characters (maximum) from unichar_repr are used. The length
43  // MUST be non-zero.
44  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
45 
46  // Return true if the given unichar representation is already present in the
47  // UNICHARMAP. The first length characters (maximum) from unichar_repr are
48  // used. The length MUST be non-zero.
49  bool contains(const char *const unichar_repr, int length) const;
50 
51  // Return the minimum number of characters that must be used from this string
52  // to obtain a match in the UNICHARMAP.
53  int minmatch(const char *const unichar_repr) const;
54 
55  // Clear the UNICHARMAP. All previous data is lost.
56  void clear();
57 
58 private:
59  // The UNICHARMAP is represented as a tree whose nodes are of type
60  // UNICHARMAP_NODE.
61  struct UNICHARMAP_NODE { // One node of the tree that backs the UNICHARMAP.
62  UNICHARMAP_NODE(); // Declared only; the definition is not part of this header.
63  ~UNICHARMAP_NODE(); // Declared only; NOTE(review): presumably releases children - confirm in the implementation.
64 
65  UNICHARMAP_NODE *children; // Child node(s) of this node; presumably an array indexed by the next character - TODO confirm.
66  UNICHAR_ID id; // Id held at this node; presumably the id of the unichar whose representation ends here - TODO confirm.
67  };
68 
69  UNICHARMAP_NODE *nodes;
70 };
71 
72 } // namespace tesseract
73 
74 #endif // TESSERACT_CCUTIL_UNICHARMAP_H_
int UNICHAR_ID
Definition: unichar.h:36
bool contains(const std::vector< T > &data, const T &value)
Definition: helpers.h:37
#define TESS_API
Definition: export.h:34