tesseract  5.0.0
clusttool.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: clusttool.cpp
3  ** Purpose: Misc. tools for use with the clustering routines
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 #define _USE_MATH_DEFINES // for M_PI
19 
20 #include "clusttool.h"
21 
22 #include <cmath> // for M_PI, std::isnan
23 #include <locale> // for std::locale::classic
24 #include <sstream> // for std::stringstream
25 
26 namespace tesseract {
27 
28 //---------------Global Data Definitions and Declarations--------------------
// Max size of tokens read from an input file; this is the size of the
// character buffers that receive "%s" conversions.
#define TOKENSIZE 80
// Field width spliced into sscanf format strings ("%" QUOTED_TOKENSIZE "s").
// It has to be a string literal and must equal TOKENSIZE - 1 so that the
// terminating NUL still fits into a TOKENSIZE-sized buffer.
#define QUOTED_TOKENSIZE "79"
// Max num of dimensions in feature space.
#define MAXSAMPLESIZE 65535

// Compile-time helper: numeric value of the leading decimal digits of s.
static constexpr int DecimalPrefixValue(const char *s, int acc = 0) {
  return (*s >= '0' && *s <= '9') ? DecimalPrefixValue(s + 1, acc * 10 + (*s - '0')) : acc;
}
// Guard the coupling between the two macros: changing one without the other
// would let sscanf write past the end of the token buffers.
static_assert(DecimalPrefixValue(QUOTED_TOKENSIZE) == TOKENSIZE - 1,
              "QUOTED_TOKENSIZE must be the decimal string of TOKENSIZE - 1");
32 
45 static bool ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
46  const int kMaxLineSize = 1024;
47  char line[kMaxLineSize];
48  if (fp->FGets(line, kMaxLineSize) == nullptr) {
49  tprintf("Hit EOF in ReadNFloats!\n");
50  return false;
51  }
52 
53  std::stringstream stream(line);
54  // Use "C" locale (needed for float values Buffer[i]).
55  stream.imbue(std::locale::classic());
56  for (uint16_t i = 0; i < N; i++) {
57  float f = NAN;
58  stream >> f;
59  if (std::isnan(f)) {
60  tprintf("Read of %u floats failed!\n", N);
61  return false;
62  }
63  Buffer[i] = f;
64  }
65  return true;
66 }
67 
/**
 * Writes N floats from Array to File on a single line. Every value is
 * preceded by a blank and formatted as "%9.6f"; the line is terminated with
 * a newline.
 *
 * @param File   open stream to write to
 * @param N      number of floats to write
 * @param Array  values to write
 */
static void WriteNFloats(FILE *File, uint16_t N, float Array[]) {
  const float *const end = Array + N;
  for (const float *value = Array; value != end; ++value) {
    fprintf(File, " %9.6f", *value);
  }
  fputc('\n', File);
}
81 
89 static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
90  switch (ProtoStyle) {
91  case spherical:
92  fprintf(File, "spherical");
93  break;
94  case elliptical:
95  fprintf(File, "elliptical");
96  break;
97  case mixed:
98  fprintf(File, "mixed");
99  break;
100  case automatic:
101  fprintf(File, "automatic");
102  break;
103  }
104 }
105 
114 uint16_t ReadSampleSize(TFile *fp) {
115  int SampleSize = 0;
116 
117  const int kMaxLineSize = 100;
118  char line[kMaxLineSize];
119  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
120  ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
121  ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
122  return SampleSize;
123 }
124 
134 PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
135  auto ParamDesc = new PARAM_DESC[N];
136  for (int i = 0; i < N; i++) {
137  const int kMaxLineSize = TOKENSIZE * 4;
138  char line[kMaxLineSize];
139  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
140  std::istringstream stream(line);
141  // Use "C" locale (needed for float values Min, Max).
142  stream.imbue(std::locale::classic());
143  std::string linear_token;
144  stream >> linear_token;
145  std::string essential_token;
146  stream >> essential_token;
147  stream >> ParamDesc[i].Min;
148  stream >> ParamDesc[i].Max;
149  ASSERT_HOST(!stream.fail());
150  ParamDesc[i].Circular = (linear_token[0] == 'c');
151  ParamDesc[i].NonEssential = (essential_token[0] != 'e');
152  ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
153  ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
154  ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
155  }
156  return (ParamDesc);
157 }
158 
168 PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
169  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
170  int SampleCount;
171  int i;
172 
173  const int kMaxLineSize = TOKENSIZE * 4;
174  char line[kMaxLineSize];
175  if (fp->FGets(line, kMaxLineSize) == nullptr ||
176  sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d", sig_token, shape_token,
177  &SampleCount) != 3) {
178  tprintf("Invalid prototype: %s\n", line);
179  return nullptr;
180  }
181  auto Proto = new PROTOTYPE;
182  Proto->Cluster = nullptr;
183  Proto->Significant = (sig_token[0] == 's');
184 
185  switch (shape_token[0]) {
186  case 's':
187  Proto->Style = spherical;
188  break;
189  case 'e':
190  Proto->Style = elliptical;
191  break;
192  case 'a':
193  Proto->Style = automatic;
194  break;
195  default:
196  tprintf("Invalid prototype style specification:%s\n", shape_token);
197  Proto->Style = elliptical;
198  }
199 
200  ASSERT_HOST(SampleCount >= 0);
201  Proto->NumSamples = SampleCount;
202 
203  Proto->Mean.resize(N);
204  ReadNFloats(fp, N, &Proto->Mean[0]);
205 
206  switch (Proto->Style) {
207  case spherical:
208  ReadNFloats(fp, 1, &(Proto->Variance.Spherical));
209  Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
210  Proto->TotalMagnitude = std::pow(Proto->Magnitude.Spherical, static_cast<float>(N));
211  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
212  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
213  Proto->Distrib.clear();
214  break;
215  case elliptical:
216  Proto->Variance.Elliptical = new float[N];
217  ReadNFloats(fp, N, Proto->Variance.Elliptical);
218  Proto->Magnitude.Elliptical = new float[N];
219  Proto->Weight.Elliptical = new float[N];
220  Proto->TotalMagnitude = 1.0;
221  for (i = 0; i < N; i++) {
222  Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
223  Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
224  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
225  }
226  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
227  Proto->Distrib.clear();
228  break;
229  default:
230  delete Proto;
231  tprintf("Invalid prototype style\n");
232  return nullptr;
233  }
234  return Proto;
235 }
236 
244 void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
245  int i;
246 
247  for (i = 0; i < N; i++) {
248  if (ParamDesc[i].Circular) {
249  fprintf(File, "circular ");
250  } else {
251  fprintf(File, "linear ");
252  }
253 
254  if (ParamDesc[i].NonEssential) {
255  fprintf(File, "non-essential ");
256  } else {
257  fprintf(File, "essential ");
258  }
259 
260  fprintf(File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
261  }
262 }
263 
271 void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
272  int i;
273 
274  if (Proto->Significant) {
275  fprintf(File, "significant ");
276  } else {
277  fprintf(File, "insignificant ");
278  }
279  WriteProtoStyle(File, static_cast<PROTOSTYLE>(Proto->Style));
280  fprintf(File, "%6d\n\t", Proto->NumSamples);
281  WriteNFloats(File, N, &Proto->Mean[0]);
282  fprintf(File, "\t");
283 
284  switch (Proto->Style) {
285  case spherical:
286  WriteNFloats(File, 1, &(Proto->Variance.Spherical));
287  break;
288  case elliptical:
289  WriteNFloats(File, N, Proto->Variance.Elliptical);
290  break;
291  case mixed:
292  for (i = 0; i < N; i++) {
293  switch (Proto->Distrib[i]) {
294  case normal:
295  fprintf(File, " %9s", "normal");
296  break;
297  case uniform:
298  fprintf(File, " %9s", "uniform");
299  break;
300  case D_random:
301  fprintf(File, " %9s", "random");
302  break;
303  case DISTRIBUTION_COUNT:
304  ASSERT_HOST(!"Distribution count not allowed!");
305  }
306  }
307  fprintf(File, "\n\t");
308  WriteNFloats(File, N, Proto->Variance.Elliptical);
309  }
310 }
311 
312 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define QUOTED_TOKENSIZE
Definition: clusttool.cpp:30
#define MAXSAMPLESIZE
max num of dimensions in feature space
Definition: clusttool.cpp:31
#define TOKENSIZE
max size of tokens read from an input file
Definition: clusttool.cpp:29
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:114
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:244
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:271
PROTOSTYLE
Definition: cluster.h:53
@ spherical
Definition: cluster.h:53
@ mixed
Definition: cluster.h:53
@ elliptical
Definition: cluster.h:53
@ automatic
Definition: cluster.h:53
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:168
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:134
@ D_random
Definition: cluster.h:65
@ DISTRIBUTION_COUNT
Definition: cluster.h:65
@ uniform
Definition: cluster.h:65
@ normal
Definition: cluster.h:65
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195
float * Elliptical
Definition: cluster.h:69
unsigned Style
Definition: cluster.h:79
std::vector< float > Mean
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:81
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
std::vector< DISTRIBUTION > Distrib
Definition: cluster.h:82