tesseract  5.0.0
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.cpp
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 /*----------------------------------------------------------------------------
18  Include Files and Type Defines
19 ----------------------------------------------------------------------------*/
20 #include "normmatch.h"
21 
22 #include "classify.h"
23 #include "clusttool.h"
24 #include "helpers.h"
25 #include "normfeat.h"
26 #include "params.h"
27 #include "unicharset.h"
28 
29 #include <cmath>
30 #include <cstdio>
31 #include <sstream> // for std::istringstream
32 
33 namespace tesseract {
34 
35 struct NORM_PROTOS {
36  NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {
37  }
38  int NumParams = 0;
39  int NumProtos;
40  PARAM_DESC *ParamDesc = nullptr;
41  std::vector<LIST> Protos;
42 };
43 
44 /*----------------------------------------------------------------------------
45  Private Code
46 ----------------------------------------------------------------------------*/
47 
55 static double NormEvidenceOf(double NormAdj) {
56  NormAdj /= classify_norm_adj_midpoint;
57 
58  if (classify_norm_adj_curl == 3) {
59  NormAdj = NormAdj * NormAdj * NormAdj;
60  } else if (classify_norm_adj_curl == 2) {
61  NormAdj = NormAdj * NormAdj;
62  } else {
63  NormAdj = pow(NormAdj, classify_norm_adj_curl);
64  }
65  return (1.0 / (1.0 + NormAdj));
66 }
67 
68 /*----------------------------------------------------------------------------
69  Variables
70 ----------------------------------------------------------------------------*/
71 
// Divisor applied to the raw distance in NormEvidenceOf() before the curl
// exponent is applied; declared with a value of 32.0.
73 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// Exponent applied to the scaled distance in NormEvidenceOf(); declared with
// a value of 2.0. Values of exactly 2 and 3 take a multiplication fast path.
74 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// Scale factor applied to the width (CharNormRy) term of the distance in
// ComputeNormMatch(), relative to the CharNormY and CharNormRx terms.
76 const double kWidthErrorWeighting = 0.125;
77 
78 /*----------------------------------------------------------------------------
79  Public Code
80 ----------------------------------------------------------------------------*/
94 float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
95  if (ClassId >= NormProtos->NumProtos) {
96  ClassId = NO_CLASS;
97  }
98 
99  /* handle requests for classification as noise */
100  if (ClassId == NO_CLASS) {
101  /* kludge - clean up constants and make into control knobs later */
102  float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
103  feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
104  feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
105  return (1.0f - NormEvidenceOf(Match));
106  }
107 
108  float BestMatch = FLT_MAX;
109  LIST Protos = NormProtos->Protos[ClassId];
110 
111  if (DebugMatch) {
112  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
113  }
114 
115  int ProtoId = 0;
116  iterate(Protos) {
117  auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
118  float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
119  float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
120  if (DebugMatch) {
121  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
122  Proto->Weight.Elliptical[CharNormY], Match);
123  }
124  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
125  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
126  if (DebugMatch) {
127  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
128  Proto->Weight.Elliptical[CharNormRx], Match);
129  }
130  // Ry is width! See intfx.cpp.
131  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
132  if (DebugMatch) {
133  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
134  Proto->Weight.Elliptical[CharNormRy]);
135  }
136  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
137  Delta *= kWidthErrorWeighting;
138  Match += Delta;
139  if (DebugMatch) {
140  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
141  Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
142  256 * (1 - NormEvidenceOf(Match)));
143  }
144 
145  if (Match < BestMatch) {
146  BestMatch = Match;
147  }
148 
149  ProtoId++;
150  }
151  return 1.0 - NormEvidenceOf(BestMatch);
152 } /* ComputeNormMatch */
153 
155  if (NormProtos != nullptr) {
156  for (int i = 0; i < NormProtos->NumProtos; i++) {
158  }
159  delete[] NormProtos->ParamDesc;
160  delete NormProtos;
161  NormProtos = nullptr;
162  }
163 }
164 
174  char unichar[2 * UNICHAR_LEN + 1];
175  UNICHAR_ID unichar_id;
176  LIST Protos;
177  int NumProtos;
178 
179  /* allocate and initialization data structure */
180  auto NormProtos = new NORM_PROTOS(unicharset.size());
181 
182  /* read file header and save in data structure */
185 
186  /* read protos for each class into a separate list */
187  const int kMaxLineSize = 100;
188  char line[kMaxLineSize];
189  while (fp->FGets(line, kMaxLineSize) != nullptr) {
190  std::istringstream stream(line);
191  stream.imbue(std::locale::classic());
192  stream >> unichar >> NumProtos;
193  if (stream.fail()) {
194  continue;
195  }
196  if (unicharset.contains_unichar(unichar)) {
197  unichar_id = unicharset.unichar_to_id(unichar);
198  Protos = NormProtos->Protos[unichar_id];
199  for (int i = 0; i < NumProtos; i++) {
200  Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
201  }
202  NormProtos->Protos[unichar_id] = Protos;
203  } else {
204  tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
205  for (int i = 0; i < NumProtos; i++) {
207  }
208  }
209  }
210  return NormProtos;
211 } /* ReadNormProtos */
212 
213 } // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:33
#define double_VAR(name, val, comment)
Definition: params.h:365
#define iterate(l)
Definition: oldlist.h:91
#define NO_CLASS
Definition: matchdefs.h:35
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:114
double classify_norm_adj_curl
Definition: normmatch.cpp:74
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const double kWidthErrorWeighting
Definition: normmatch.cpp:76
double classify_norm_adj_midpoint
Definition: normmatch.cpp:73
@ CharNormLength
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRx
Definition: normfeat.h:30
int UNICHAR_ID
Definition: unichar.h:36
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1598
void FreePrototype(void *arg)
Definition: cluster.cpp:1609
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:192
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:168
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:134
UNICHARSET unicharset
Definition: ccutil.h:61
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:173
NORM_PROTOS * NormProtos
Definition: classify.h:433
std::vector< LIST > Protos
Definition: normmatch.cpp:41
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:40
std::vector< float > Params
Definition: ocrfeatures.h:66
list_rec * first_node()
Definition: oldlist.h:107