tesseract  5.0.0
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #define _USE_MATH_DEFINES // for M_PI
15 
16 #include "commontraining.h"
17 
18 #ifdef DISABLED_LEGACY_ENGINE
19 
20 # include "params.h"
21 # include "tprintf.h"
22 
23 namespace tesseract {
24 
25 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
26 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
27 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
28 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
29 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
30 STRING_PARAM_FLAG(X, "", "File listing font xheights");
31 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
32 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
33 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
34 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
35 STRING_PARAM_FLAG(fonts_dir, "",
36  "If empty it uses system default. Otherwise it overrides "
37  "system default font location");
38 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir");
39 
51 void ParseArguments(int *argc, char ***argv) {
52  std::string usage;
53  if (*argc) {
54  usage += (*argv)[0];
55  usage += " -v | --version | ";
56  usage += (*argv)[0];
57  }
58  usage += " [.tr files ...]";
59  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
60 }
61 
62 } // namespace tesseract.
63 
64 #else
65 
66 # include <allheaders.h>
67 # include "ccutil.h"
68 # include "classify.h"
69 # include "cluster.h"
70 # include "clusttool.h"
71 # include "featdefs.h"
72 # include "fontinfo.h"
73 # include "intfeaturespace.h"
74 # include "mastertrainer.h"
75 # include "mf.h"
76 # include "oldlist.h"
77 # include "params.h"
78 # include "shapetable.h"
79 # include "tessdatamanager.h"
80 # include "tprintf.h"
81 # include "unicity_table.h"
82 
83 namespace tesseract {
84 
85 // Global Variables.
86 
87 // global variable to hold configuration parameters to control clustering
88 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
89 CLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0};
91 static CCUtil ccutil;
92 
93 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
94 static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
95 static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
96 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
97 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
98 STRING_PARAM_FLAG(X, "", "File listing font xheights");
99 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
100 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
101 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
102 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
103 STRING_PARAM_FLAG(fonts_dir, "", "");
104 STRING_PARAM_FLAG(fontconfig_tmpdir, "", "");
105 static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
106  "Min number of samples per proto as % of total");
107 static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
108  "Max percentage of samples in a cluster which have more"
109  " than 1 feature in that cluster");
110 static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
111  "Desired independence between dimensions");
112 static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
113  "Desired confidence in prototypes created");
114 
125 void ParseArguments(int *argc, char ***argv) {
126  std::string usage;
127  if (*argc) {
128  usage += (*argv)[0];
129  usage += " -v | --version | ";
130  usage += (*argv)[0];
131  }
132  usage += " [.tr files ...]";
133  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
134  // Set some global values based on the flags.
136  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
137  Config.MaxIllegal = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
138  Config.Independence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
139  Config.Confidence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
140  // Set additional parameters from config file if specified.
141  if (!FLAGS_configfile.empty()) {
143  FLAGS_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, ccutil.params());
144  }
145 }
146 
147 // Helper loads shape table from the given file.
148 ShapeTable *LoadShapeTable(const std::string &file_prefix) {
149  ShapeTable *shape_table = nullptr;
150  std::string shape_table_file = file_prefix;
151  shape_table_file += kShapeTableFileSuffix;
152  TFile shape_fp;
153  if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {
154  shape_table = new ShapeTable;
155  if (!shape_table->DeSerialize(&shape_fp)) {
156  delete shape_table;
157  shape_table = nullptr;
158  tprintf("Error: Failed to read shape table %s\n", shape_table_file.c_str());
159  } else {
160  int num_shapes = shape_table->NumShapes();
161  tprintf("Read shape table %s of %d shapes\n", shape_table_file.c_str(), num_shapes);
162  }
163  } else {
164  tprintf("Warning: No shape table file present: %s\n", shape_table_file.c_str());
165  }
166  return shape_table;
167 }
168 
169 // Helper to write the shape_table.
170 void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table) {
171  std::string shape_table_file = file_prefix;
172  shape_table_file += kShapeTableFileSuffix;
173  FILE *fp = fopen(shape_table_file.c_str(), "wb");
174  if (fp != nullptr) {
175  if (!shape_table.Serialize(fp)) {
176  fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.c_str());
177  }
178  fclose(fp);
179  } else {
180  fprintf(stderr, "Error creating shape table: %s\n", shape_table_file.c_str());
181  }
182 }
183 
200 std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
201  ShapeTable **shape_table, std::string &file_prefix) {
203  InitIntegerFX();
204  file_prefix = "";
205  if (!FLAGS_D.empty()) {
206  file_prefix += FLAGS_D.c_str();
207  file_prefix += "/";
208  }
209  // If we are shape clustering (nullptr shape_table) or we successfully load
210  // a shape_table written by a previous shape clustering, then
211  // shape_analysis will be true, meaning that the MasterTrainer will replace
212  // some members of the unicharset with their fragments.
213  bool shape_analysis = false;
214  if (shape_table != nullptr) {
215  *shape_table = LoadShapeTable(file_prefix);
216  if (*shape_table != nullptr) {
217  shape_analysis = true;
218  }
219  } else {
220  shape_analysis = true;
221  }
222  auto trainer = std::make_unique<MasterTrainer>(NM_CHAR_ANISOTROPIC, shape_analysis, replication,
223  FLAGS_debug_level);
224  IntFeatureSpace fs;
226  trainer->LoadUnicharset(FLAGS_U.c_str());
227  // Get basic font information from font_properties.
228  if (!FLAGS_F.empty()) {
229  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
230  return {};
231  }
232  }
233  if (!FLAGS_X.empty()) {
234  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
235  return {};
236  }
237  }
238  trainer->SetFeatureSpace(fs);
239  // Load training data from .tr files in filelist (terminated by nullptr).
240  for (const char *page_name = *filelist++; page_name != nullptr; page_name = *filelist++) {
241  tprintf("Reading %s ...\n", page_name);
242  trainer->ReadTrainingSamples(page_name, feature_defs, false);
243 
244  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
245  // read font spacing information in to fontinfo_table.
246  int pagename_len = strlen(page_name);
247  char *fontinfo_file_name = new char[pagename_len + 7];
248  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
249  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
250  trainer->AddSpacingInfo(fontinfo_file_name);
251  delete[] fontinfo_file_name;
252 
253  // Load the images into memory if required by the classifier.
254  if (FLAGS_load_images) {
255  std::string image_name = page_name;
256  // Chop off the tr and replace with tif. Extension must be tif!
257  image_name.resize(image_name.length() - 2);
258  image_name += "tif";
259  trainer->LoadPageImages(image_name.c_str());
260  }
261  }
262  trainer->PostLoadCleanup();
263  // Write the master trainer if required.
264  if (!FLAGS_output_trainer.empty()) {
265  FILE *fp = fopen(FLAGS_output_trainer.c_str(), "wb");
266  if (fp == nullptr) {
267  tprintf("Can't create saved trainer data!\n");
268  } else {
269  trainer->Serialize(fp);
270  fclose(fp);
271  }
272  }
273  trainer->PreTrainingSetup();
274  if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
275  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
276  return {};
277  }
278 
279  if (shape_table != nullptr) {
280  // If we previously failed to load a shapetable, then shape clustering
281  // wasn't run so make a flat one now.
282  if (*shape_table == nullptr) {
283  *shape_table = new ShapeTable;
284  trainer->SetupFlatShapeTable(*shape_table);
285  tprintf("Flat shape table summary: %s\n", (*shape_table)->SummaryStr().c_str());
286  }
287  (*shape_table)->set_unicharset(trainer->unicharset());
288  }
289  return trainer;
290 }
291 
292 /*---------------------------------------------------------------------------*/
302 LABELEDLIST FindList(LIST List, const std::string &Label) {
303  LABELEDLIST LabeledList;
304 
305  iterate(List) {
306  LabeledList = reinterpret_cast<LABELEDLIST>(List->first_node());
307  if (LabeledList->Label == Label) {
308  return (LabeledList);
309  }
310  }
311  return (nullptr);
312 
313 } /* FindList */
314 
315 /*---------------------------------------------------------------------------*/
316 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
317 // the new method or get rid of it entirely.
330 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
331  int max_samples, UNICHARSET *unicharset, FILE *file,
332  LIST *training_samples) {
333  char buffer[2048];
334  char unichar[UNICHAR_LEN + 1];
335  LABELEDLIST char_sample;
336  FEATURE_SET feature_samples;
337  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);
338 
339  // Zero out the font_sample_count for all the classes.
340  LIST it = *training_samples;
341  iterate(it) {
342  char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
343  char_sample->font_sample_count = 0;
344  }
345 
346  while (fgets(buffer, 2048, file) != nullptr) {
347  if (buffer[0] == '\n') {
348  continue;
349  }
350 
351  sscanf(buffer, "%*s %s", unichar);
352  if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
353  unicharset->unichar_insert(unichar);
354  if (unicharset->size() > MAX_NUM_CLASSES) {
355  tprintf(
356  "Error: Size of unicharset in training is "
357  "greater than MAX_NUM_CLASSES\n");
358  exit(1);
359  }
360  }
361  char_sample = FindList(*training_samples, unichar);
362  if (char_sample == nullptr) {
363  char_sample = new LABELEDLISTNODE(unichar);
364  *training_samples = push(*training_samples, char_sample);
365  }
366  auto char_desc = ReadCharDescription(feature_definitions, file);
367  feature_samples = char_desc->FeatureSets[feature_type];
368  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
369  char_sample->List = push(char_sample->List, feature_samples);
370  char_sample->SampleCount++;
371  char_sample->font_sample_count++;
372  } else {
373  delete feature_samples;
374  }
375  for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
376  if (feature_type != i) {
377  delete char_desc->FeatureSets[i];
378  }
379  }
380  delete char_desc;
381  }
382 } // ReadTrainingSamples
383 
384 /*---------------------------------------------------------------------------*/
390 void FreeTrainingSamples(LIST CharList) {
391  LABELEDLIST char_sample;
392  FEATURE_SET FeatureSet;
393  LIST FeatureList;
394 
395  LIST nodes = CharList;
396  iterate(CharList) { /* iterate through all of the fonts */
397  char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
398  FeatureList = char_sample->List;
399  iterate(FeatureList) { /* iterate through all of the classes */
400  FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
401  delete FeatureSet;
402  }
403  FreeLabeledList(char_sample);
404  }
405  destroy(nodes);
406 } /* FreeTrainingSamples */
407 
408 /*---------------------------------------------------------------------------*/
416 void FreeLabeledList(LABELEDLIST LabeledList) {
417  destroy(LabeledList->List);
418  delete LabeledList;
419 } /* FreeLabeledList */
420 
421 /*---------------------------------------------------------------------------*/
434  const char *program_feature_type) {
435  uint16_t N;
436  CLUSTERER *Clusterer;
437  LIST FeatureList = nullptr;
438  FEATURE_SET FeatureSet = nullptr;
439 
440  int32_t desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
441  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
442  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
443 
444  FeatureList = char_sample->List;
445  uint32_t CharID = 0;
446  std::vector<float> Sample;
447  iterate(FeatureList) {
448  FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
449  for (int i = 0; i < FeatureSet->MaxNumFeatures; i++) {
450  if (Sample.empty()) {
451  Sample.resize(N);
452  }
453  for (int j = 0; j < N; j++) {
454  Sample[j] = FeatureSet->Features[i]->Params[j];
455  }
456  MakeSample(Clusterer, &Sample[0], CharID);
457  }
458  CharID++;
459  }
460  return Clusterer;
461 
462 } /* SetUpForClustering */
463 
464 /*------------------------------------------------------------------------*/
465 void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer,
466  CLUSTERCONFIG *clusterconfig) {
467  PROTOTYPE *Prototype;
468  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
469 
470  LIST pProtoList = ProtoList;
471  iterate(pProtoList) {
472  Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
473  if (Prototype->Significant || Prototype->Merged) {
474  continue;
475  }
476  float best_dist = 0.125;
477  PROTOTYPE *best_match = nullptr;
478  // Find the nearest alive prototype.
479  LIST list_it = ProtoList;
480  iterate(list_it) {
481  auto *test_p = reinterpret_cast<PROTOTYPE *>(list_it->first_node());
482  if (test_p != Prototype && !test_p->Merged) {
483  float dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, &Prototype->Mean[0],
484  &test_p->Mean[0]);
485  if (dist < best_dist) {
486  best_match = test_p;
487  best_dist = dist;
488  }
489  }
490  }
491  if (best_match != nullptr && !best_match->Significant) {
492  if (debug) {
493  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", best_match->NumSamples,
494  Prototype->NumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0],
495  Prototype->Mean[1]);
496  }
497  best_match->NumSamples =
498  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples,
499  Prototype->NumSamples, &best_match->Mean[0], &best_match->Mean[0], &Prototype->Mean[0]);
500  Prototype->NumSamples = 0;
501  Prototype->Merged = true;
502  } else if (best_match != nullptr) {
503  if (debug) {
504  tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0],
505  Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]);
506  }
507  Prototype->Merged = true;
508  }
509  }
510  // Mark significant those that now have enough samples.
511  int min_samples = static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
512  pProtoList = ProtoList;
513  iterate(pProtoList) {
514  Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
515  // Process insignificant protos that do not match a green one
516  if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) {
517  if (debug) {
518  tprintf("Red proto at %g,%g becoming green\n", Prototype->Mean[0], Prototype->Mean[1]);
519  }
520  Prototype->Significant = true;
521  }
522  }
523 } /* MergeInsignificantProtos */
524 
525 /*-----------------------------------------------------------------------------*/
526 void CleanUpUnusedData(LIST ProtoList) {
527  PROTOTYPE *Prototype;
528 
529  iterate(ProtoList) {
530  Prototype = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
531  delete[] Prototype->Variance.Elliptical;
532  Prototype->Variance.Elliptical = nullptr;
533  delete[] Prototype->Magnitude.Elliptical;
534  Prototype->Magnitude.Elliptical = nullptr;
535  delete[] Prototype->Weight.Elliptical;
536  Prototype->Weight.Elliptical = nullptr;
537  }
538 }
539 
540 /*------------------------------------------------------------------------*/
541 LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
542 
543 {
544  LIST NewProtoList = NIL_LIST;
545  auto pProtoList = ProtoList;
546  iterate(pProtoList) {
547  auto Proto = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
548  if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {
549  auto NewProto = new PROTOTYPE;
550  NewProto->Mean = Proto->Mean;
551  NewProto->Significant = Proto->Significant;
552  NewProto->Style = Proto->Style;
553  NewProto->NumSamples = Proto->NumSamples;
554  NewProto->Cluster = nullptr;
555  NewProto->Distrib.clear();
556 
557  if (Proto->Variance.Elliptical != nullptr) {
558  NewProto->Variance.Elliptical = new float[N];
559  for (int i = 0; i < N; i++) {
560  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
561  }
562  } else {
563  NewProto->Variance.Elliptical = nullptr;
564  }
565  //---------------------------------------------
566  if (Proto->Magnitude.Elliptical != nullptr) {
567  NewProto->Magnitude.Elliptical = new float[N];
568  for (int i = 0; i < N; i++) {
569  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
570  }
571  } else {
572  NewProto->Magnitude.Elliptical = nullptr;
573  }
574  //------------------------------------------------
575  if (Proto->Weight.Elliptical != nullptr) {
576  NewProto->Weight.Elliptical = new float[N];
577  for (int i = 0; i < N; i++) {
578  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
579  }
580  } else {
581  NewProto->Weight.Elliptical = nullptr;
582  }
583 
584  NewProto->TotalMagnitude = Proto->TotalMagnitude;
585  NewProto->LogMagnitude = Proto->LogMagnitude;
586  NewProtoList = push_last(NewProtoList, NewProto);
587  }
588  }
589  FreeProtoList(&ProtoList);
590  return (NewProtoList);
591 } /* RemoveInsignificantProtos */
592 
593 /*----------------------------------------------------------------------------*/
594 MERGE_CLASS FindClass(LIST List, const std::string &Label) {
595  MERGE_CLASS MergeClass;
596 
597  iterate(List) {
598  MergeClass = reinterpret_cast<MERGE_CLASS>(List->first_node());
599  if (MergeClass->Label == Label) {
600  return (MergeClass);
601  }
602  }
603  return (nullptr);
604 
605 } /* FindClass */
606 
607 /*-----------------------------------------------------------------------------*/
613 void FreeLabeledClassList(LIST ClassList) {
614  MERGE_CLASS MergeClass;
615 
616  LIST nodes = ClassList;
617  iterate(ClassList) /* iterate through all of the fonts */
618  {
619  MergeClass = reinterpret_cast<MERGE_CLASS>(ClassList->first_node());
620  FreeClass(MergeClass->Class);
621  delete MergeClass;
622  }
623  destroy(nodes);
624 
625 } /* FreeLabeledClassList */
626 
627 /* SetUpForFloat2Int */
628 CLASS_STRUCT *SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList) {
629  MERGE_CLASS MergeClass;
630  CLASS_TYPE Class;
631  int NumProtos;
632  int NumConfigs;
633  int NumWords;
634  int i, j;
635  float Values[3];
636  PROTO_STRUCT *NewProto;
637  PROTO_STRUCT *OldProto;
638  BIT_VECTOR NewConfig;
639  BIT_VECTOR OldConfig;
640 
641  // printf("Float2Int ...\n");
642 
643  auto *float_classes = new CLASS_STRUCT[unicharset.size()];
644  iterate(LabeledClassList) {
645  UnicityTable<int> font_set;
646  MergeClass = reinterpret_cast<MERGE_CLASS>(LabeledClassList->first_node());
647  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label.c_str())];
648  NumProtos = MergeClass->Class->NumProtos;
649  NumConfigs = MergeClass->Class->NumConfigs;
650  font_set.move(&MergeClass->Class->font_set);
651  Class->NumProtos = NumProtos;
652  Class->MaxNumProtos = NumProtos;
653  Class->Prototypes.resize(NumProtos);
654  for (i = 0; i < NumProtos; i++) {
655  NewProto = ProtoIn(Class, i);
656  OldProto = ProtoIn(MergeClass->Class, i);
657  Values[0] = OldProto->X;
658  Values[1] = OldProto->Y;
659  Values[2] = OldProto->Angle;
660  Normalize(Values);
661  NewProto->X = OldProto->X;
662  NewProto->Y = OldProto->Y;
663  NewProto->Length = OldProto->Length;
664  NewProto->Angle = OldProto->Angle;
665  NewProto->A = Values[0];
666  NewProto->B = Values[1];
667  NewProto->C = Values[2];
668  }
669 
670  Class->NumConfigs = NumConfigs;
671  Class->MaxNumConfigs = NumConfigs;
672  Class->font_set.move(&font_set);
673  Class->Configurations.resize(NumConfigs);
674  NumWords = WordsInVectorOfSize(NumProtos);
675  for (i = 0; i < NumConfigs; i++) {
676  NewConfig = NewBitVector(NumProtos);
677  OldConfig = MergeClass->Class->Configurations[i];
678  for (j = 0; j < NumWords; j++) {
679  NewConfig[j] = OldConfig[j];
680  }
681  Class->Configurations[i] = NewConfig;
682  }
683  }
684  return float_classes;
685 } // SetUpForFloat2Int
686 
687 /*--------------------------------------------------------------------------*/
// Rewrites Values in place.
//
// On entry Values holds {x, y, angle}; the slope is tan(angle * 2 * pi) and
// the intercept is y - slope * x. On exit Values holds
// {slope * k, -k, intercept * k} with k = 1 / sqrt(slope * slope + 1).
void Normalize(float *Values) {
  const float slope = tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  const float scale = 1 / sqrt(slope * slope + 1.0);

  Values[0] = slope * scale;
  Values[1] = -scale;
  Values[2] = intercept * scale;
} // Normalize
701 
702 /*-------------------------------------------------------------------------*/
703 void FreeNormProtoList(LIST CharList)
704 
705 {
706  LABELEDLIST char_sample;
707 
708  LIST nodes = CharList;
709  iterate(CharList) /* iterate through all of the fonts */
710  {
711  char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
712  FreeLabeledList(char_sample);
713  }
714  destroy(nodes);
715 
716 } // FreeNormProtoList
717 
718 /*---------------------------------------------------------------------------*/
719 void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName) {
720  auto LabeledProtoList = new LABELEDLISTNODE(CharName.c_str());
721  iterate(ProtoList) {
722  auto Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
723  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
724  }
725  *NormProtoList = push(*NormProtoList, LabeledProtoList);
726 }
727 
728 /*---------------------------------------------------------------------------*/
729 int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos) {
730  int N = 0;
731  iterate(ProtoList) {
732  auto *Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
733  if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {
734  N++;
735  }
736  }
737  return (N);
738 }
739 
740 } // namespace tesseract.
741 
742 #endif // def DISABLED_LEGACY_ENGINE
#define UNICHAR_LEN
Definition: unichar.h:33
const int kBoostXYBuckets
const int kBoostDirBuckets
#define ProtoIn(Class, Pid)
Definition: protos.h:70
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define DOUBLE_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
MERGE_CLASS FindClass(LIST List, const std::string &Label)
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:42
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203
void Normalize(float *Values)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:400
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:172
void FreeLabeledList(LABELEDLIST LabeledList)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:1871
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
LIST destroy(LIST list)
Definition: oldlist.cpp:121
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
ShapeTable * LoadShapeTable(const std::string &file_prefix)
void InitIntegerFX()
Definition: intfx.cpp:54
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:87
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1598
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1441
void FreeTrainingSamples(LIST CharList)
void CleanUpUnusedData(LIST ProtoList)
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:192
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:119
@ elliptical
Definition: cluster.h:53
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
@ NM_CHAR_ANISOTROPIC
Definition: normalis.h:49
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1492
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void FreeLabeledClassList(LIST ClassList)
LABELEDLIST FindList(LIST List, const std::string &Label)
void move(UnicityTable< T > *from)
Definition: unicity_table.h:98
ParamsVectors * params()
Definition: ccutil.h:53
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
float * Elliptical
Definition: cluster.h:69
std::vector< float > Mean
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:87
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:88
int16_t SampleSize
Definition: cluster.h:92
PARAM_DESC * ParamDesc
Definition: cluster.h:93
uint32_t NumChar
Definition: cluster.h:98
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
std::vector< FEATURE_STRUCT * > Features
Definition: ocrfeatures.h:85
std::vector< BIT_VECTOR > Configurations
Definition: protos.h:46
UnicityTable< int > font_set
Definition: protos.h:47
int16_t MaxNumConfigs
Definition: protos.h:44
int16_t MaxNumProtos
Definition: protos.h:42
std::vector< PROTO_STRUCT > Prototypes
Definition: protos.h:45
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
unsigned NumShapes() const
Definition: shapetable.h:248
list_rec * first_node()
Definition: oldlist.h:107
tesseract::CLASS_TYPE Class