tesseract  5.0.0
tesseract::SampleIterator Class Reference

#include <sampleiterator.h>

Public Member Functions

 SampleIterator ()
 
 ~SampleIterator ()
 
void Clear ()
 
void Init (const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
 
void Begin ()
 
bool AtEnd () const
 
const TrainingSample & GetSample () const
 
TrainingSample * MutableSample () const
 
int GlobalSampleIndex () const
 
int GetCompactClassID () const
 
int GetSparseClassID () const
 
void Next ()
 
int CompactCharsetSize () const
 
int SparseCharsetSize () const
 
const IndexMapBiDi & charset_map () const
 
const ShapeTable * shape_table () const
 
const TrainingSampleSet * sample_set () const
 
void MapSampleFeatures (const IntFeatureMap &feature_map)
 
int UniformSamples ()
 
double NormalizeSamples ()
 

Detailed Description

Definition at line 91 of file sampleiterator.h.

Constructor & Destructor Documentation

◆ SampleIterator()

tesseract::SampleIterator::SampleIterator ( )

Definition at line 29 of file sampleiterator.cpp.

30  : charset_map_(nullptr)
31  , shape_table_(nullptr)
32  , sample_set_(nullptr)
33  , randomize_(false)
34  , owned_shape_table_(nullptr) {
35  num_shapes_ = 0;
36  Begin();
37 }

◆ ~SampleIterator()

tesseract::SampleIterator::~SampleIterator ( )

Definition at line 39 of file sampleiterator.cpp.

39  {
40  Clear();
41 }

Member Function Documentation

◆ AtEnd()

bool tesseract::SampleIterator::AtEnd ( ) const

Definition at line 98 of file sampleiterator.cpp.

98  {
99  return shape_index_ >= num_shapes_;
100 }

◆ Begin()

void tesseract::SampleIterator::Begin ( )

Definition at line 86 of file sampleiterator.cpp.

86  {
87  shape_index_ = -1;
88  shape_char_index_ = 0;
89  num_shape_chars_ = 0;
90  shape_font_index_ = 0;
91  num_shape_fonts_ = 0;
92  sample_index_ = 0;
93  num_samples_ = 0;
94  // Find the first indexable sample.
95  Next();
96 }

◆ charset_map()

const IndexMapBiDi& tesseract::SampleIterator::charset_map ( ) const
inline

Definition at line 134 of file sampleiterator.h.

134  {
135  return *charset_map_;
136  }

◆ Clear()

void tesseract::SampleIterator::Clear ( )

Definition at line 43 of file sampleiterator.cpp.

43  {
44  delete owned_shape_table_;
45  owned_shape_table_ = nullptr;
46 }

◆ CompactCharsetSize()

int tesseract::SampleIterator::CompactCharsetSize ( ) const

Definition at line 195 of file sampleiterator.cpp.

195  {
196  return charset_map_ != nullptr ? charset_map_->CompactSize() : SparseCharsetSize();
197 }
int CompactSize() const
Definition: indexmapbidi.h:63

◆ GetCompactClassID()

int tesseract::SampleIterator::GetCompactClassID ( ) const

Definition at line 141 of file sampleiterator.cpp.

141  {
142  return charset_map_ != nullptr ? charset_map_->SparseToCompact(shape_index_) : GetSparseClassID();
143 }
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:140

◆ GetSample()

const TrainingSample & tesseract::SampleIterator::GetSample ( ) const

Definition at line 102 of file sampleiterator.cpp.

102  {
103  if (shape_table_ != nullptr) {
104  const UnicharAndFonts *shape_entry = GetShapeEntry();
105  int char_id = shape_entry->unichar_id;
106  int font_id = shape_entry->font_ids[shape_font_index_];
107  return *sample_set_->GetSample(font_id, char_id, sample_index_);
108  } else {
109  return *sample_set_->GetSample(shape_index_);
110  }
111 }
const TrainingSample * GetSample(int index) const

◆ GetSparseClassID()

int tesseract::SampleIterator::GetSparseClassID ( ) const

Definition at line 148 of file sampleiterator.cpp.

148  {
149  return shape_table_ != nullptr ? shape_index_ : GetSample().class_id();
150 }
UNICHAR_ID class_id() const
const TrainingSample & GetSample() const

◆ GlobalSampleIndex()

int tesseract::SampleIterator::GlobalSampleIndex ( ) const

Definition at line 126 of file sampleiterator.cpp.

126  {
127  if (shape_table_ != nullptr) {
128  const UnicharAndFonts *shape_entry = GetShapeEntry();
129  int char_id = shape_entry->unichar_id;
130  int font_id = shape_entry->font_ids[shape_font_index_];
131  return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);
132  } else {
133  return shape_index_;
134  }
135 }
int GlobalSampleIndex(int font_id, int class_id, int index) const

◆ Init()

void tesseract::SampleIterator::Init ( const IndexMapBiDi * charset_map,
const ShapeTable * shape_table,
bool  randomize,
TrainingSampleSet * sample_set 
)

Definition at line 49 of file sampleiterator.cpp.

50  {
51  Clear();
52  charset_map_ = charset_map;
53  shape_table_ = shape_table;
54  sample_set_ = sample_set;
55  randomize_ = randomize;
56  if (shape_table_ == nullptr && charset_map_ != nullptr) {
57  // The caller wishes to iterate by class. The easiest way to do this
58  // is to create a dummy shape_table_ that we will own.
59  int num_fonts = sample_set_->NumFonts();
60  owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
61  int charsetsize = sample_set_->unicharset().size();
62  for (int c = 0; c < charsetsize; ++c) {
63  // We always add a shape for each character to keep the index in sync
64  // with the unichar_id.
65  int shape_id = owned_shape_table_->AddShape(c, 0);
66  for (int f = 1; f < num_fonts; ++f) {
67  if (sample_set_->NumClassSamples(f, c, true) > 0) {
68  owned_shape_table_->AddToShape(shape_id, c, f);
69  }
70  }
71  }
72  shape_table_ = owned_shape_table_;
73  }
74  if (shape_table_ != nullptr) {
75  num_shapes_ = shape_table_->NumShapes();
76  } else {
77  num_shapes_ = randomize ? sample_set_->num_samples() : sample_set_->num_raw_samples();
78  }
79  Begin();
80 }
size_t size() const
Definition: unicharset.h:355
void AddToShape(unsigned shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:383
unsigned AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:351
unsigned NumShapes() const
Definition: shapetable.h:248
const TrainingSampleSet * sample_set() const
const IndexMapBiDi & charset_map() const
const ShapeTable * shape_table() const
int NumClassSamples(int font_id, int class_id, bool randomize) const
const UNICHARSET & unicharset() const

◆ MapSampleFeatures()

void tesseract::SampleIterator::MapSampleFeatures ( const IntFeatureMap & feature_map )

Definition at line 219 of file sampleiterator.cpp.

219  {
220  for (Begin(); !AtEnd(); Next()) {
221  TrainingSample *sample = MutableSample();
222  MapFeatures(*sample, feature_map);
223  }
224 }
TrainingSample * MutableSample() const

◆ MutableSample()

TrainingSample * tesseract::SampleIterator::MutableSample ( ) const

Definition at line 113 of file sampleiterator.cpp.

113  {
114  if (shape_table_ != nullptr) {
115  const UnicharAndFonts *shape_entry = GetShapeEntry();
116  int char_id = shape_entry->unichar_id;
117  int font_id = shape_entry->font_ids[shape_font_index_];
118  return sample_set_->MutableSample(font_id, char_id, sample_index_);
119  } else {
120  return sample_set_->mutable_sample(shape_index_);
121  }
122 }
TrainingSample * mutable_sample(int index)
TrainingSample * MutableSample(int font_id, int class_id, int index)

◆ Next()

void tesseract::SampleIterator::Next ( )

Definition at line 154 of file sampleiterator.cpp.

154  {
155  if (shape_table_ != nullptr) {
156  // Next sample in this class/font combination.
157  ++sample_index_;
158  if (sample_index_ < num_samples_) {
159  return;
160  }
161  // Next font in this class in this shape.
162  sample_index_ = 0;
163  do {
164  ++shape_font_index_;
165  if (shape_font_index_ >= num_shape_fonts_) {
166  // Next unichar in this shape.
167  shape_font_index_ = 0;
168  ++shape_char_index_;
169  if (shape_char_index_ >= num_shape_chars_) {
170  // Find the next shape that is mapped in the charset_map_.
171  shape_char_index_ = 0;
172  do {
173  ++shape_index_;
174  } while (shape_index_ < num_shapes_ && charset_map_ != nullptr &&
175  charset_map_->SparseToCompact(shape_index_) < 0);
176  if (shape_index_ >= num_shapes_) {
177  return; // The end.
178  }
179  num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
180  }
181  }
182  const UnicharAndFonts *shape_entry = GetShapeEntry();
183  num_shape_fonts_ = shape_entry->font_ids.size();
184  int char_id = shape_entry->unichar_id;
185  int font_id = shape_entry->font_ids[shape_font_index_];
186  num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
187  } while (num_samples_ == 0);
188  } else {
189  // We are just iterating over the samples.
190  ++shape_index_;
191  }
192 }
int size() const
Definition: shapetable.h:169
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292

◆ NormalizeSamples()

double tesseract::SampleIterator::NormalizeSamples ( )

Definition at line 241 of file sampleiterator.cpp.

241  {
242  double total_weight = 0.0;
243  int sample_count = 0;
244  for (Begin(); !AtEnd(); Next()) {
245  const TrainingSample &sample = GetSample();
246  total_weight += sample.weight();
247  ++sample_count;
248  }
249  // Normalize samples.
250  double min_assigned_sample_weight = 1.0;
251  if (total_weight > 0.0) {
252  for (Begin(); !AtEnd(); Next()) {
253  TrainingSample *sample = MutableSample();
254  double weight = sample->weight() / total_weight;
255  if (weight < min_assigned_sample_weight) {
256  min_assigned_sample_weight = weight;
257  }
258  sample->set_weight(weight);
259  }
260  }
261  return min_assigned_sample_weight;
262 }

◆ sample_set()

const TrainingSampleSet* tesseract::SampleIterator::sample_set ( ) const
inline

Definition at line 141 of file sampleiterator.h.

141  {
142  return sample_set_;
143  }

◆ shape_table()

const ShapeTable* tesseract::SampleIterator::shape_table ( ) const
inline

Definition at line 137 of file sampleiterator.h.

137  {
138  return shape_table_;
139  }

◆ SparseCharsetSize()

int tesseract::SampleIterator::SparseCharsetSize ( ) const

Definition at line 200 of file sampleiterator.cpp.

200  {
201  return charset_map_ != nullptr
202  ? charset_map_->SparseSize()
203  : (shape_table_ != nullptr ? shape_table_->NumShapes() : sample_set_->charsetsize());
204 }
int SparseSize() const override
Definition: indexmapbidi.h:144

◆ UniformSamples()

int tesseract::SampleIterator::UniformSamples ( )

Definition at line 228 of file sampleiterator.cpp.

228  {
229  int num_good_samples = 0;
230  for (Begin(); !AtEnd(); Next()) {
231  TrainingSample *sample = MutableSample();
232  sample->set_weight(1.0);
233  ++num_good_samples;
234  }
236  return num_good_samples;
237 }

The documentation for this class was generated from the following files: