tesseract  5.0.0
statistc.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.cpp (Formerly stats.c)
3  * Description: Simple statistical package for integer values.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "statistc.h"
25 
26 #include "errcode.h"
27 #include "scrollview.h"
28 #include "tprintf.h"
29 
30 #include "helpers.h"
31 
32 #include <cmath>
33 #include <cstdlib>
34 #include <cstring>
35 
36 namespace tesseract {
37 
38 /**********************************************************************
39  * STATS::STATS
40  *
41  * Construct a new stats element by allocating and zeroing the memory.
42  **********************************************************************/
43 STATS::STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1) {
44  if (max_bucket_value_plus_1 <= min_bucket_value) {
45  min_bucket_value = 0;
46  max_bucket_value_plus_1 = 1;
47  }
48  rangemin_ = min_bucket_value; // setup
49  rangemax_ = max_bucket_value_plus_1;
50  buckets_ = new int32_t[rangemax_ - rangemin_];
51  clear();
52 }
53 
54 /**********************************************************************
55  * STATS::set_range
56  *
57  * Alter the range on an existing stats element.
58  **********************************************************************/
59 bool STATS::set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1) {
60  if (max_bucket_value_plus_1 <= min_bucket_value) {
61  return false;
62  }
63  if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
64  delete[] buckets_;
65  buckets_ = new int32_t[max_bucket_value_plus_1 - min_bucket_value];
66  }
67  rangemin_ = min_bucket_value; // setup
68  rangemax_ = max_bucket_value_plus_1;
69  clear(); // zero it
70  return true;
71 }
72 
73 /**********************************************************************
74  * STATS::clear
75  *
76  * Clear out the STATS class by zeroing all the buckets.
77  **********************************************************************/
78 void STATS::clear() { // clear out buckets
79  total_count_ = 0;
80  if (buckets_ != nullptr) {
81  memset(buckets_, 0, (rangemax_ - rangemin_) * sizeof(buckets_[0]));
82  }
83 }
84 
85 /**********************************************************************
86  * STATS::~STATS
87  *
88  * Destructor for a stats class.
89  **********************************************************************/
91  delete[] buckets_;
92 }
93 
94 /**********************************************************************
95  * STATS::add
96  *
97  * Add a set of samples to (or delete from) a pile.
98  **********************************************************************/
99 void STATS::add(int32_t value, int32_t count) {
100  if (buckets_ == nullptr) {
101  return;
102  }
103  value = ClipToRange(value, rangemin_, rangemax_ - 1);
104  buckets_[value - rangemin_] += count;
105  total_count_ += count; // keep count of total
106 }
107 
108 /**********************************************************************
109  * STATS::mode
110  *
111  * Find the mode of a stats class.
112  **********************************************************************/
113 int32_t STATS::mode() const { // get mode of samples
114  if (buckets_ == nullptr) {
115  return rangemin_;
116  }
117  int32_t max = buckets_[0]; // max cell count
118  int32_t maxindex = 0; // index of max
119  for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {
120  if (buckets_[index] > max) {
121  max = buckets_[index]; // find biggest
122  maxindex = index;
123  }
124  }
125  return maxindex + rangemin_; // index of biggest
126 }
127 
128 /**********************************************************************
129  * STATS::mean
130  *
131  * Find the mean of a stats class.
132  **********************************************************************/
133 double STATS::mean() const { // get mean of samples
134  if (buckets_ == nullptr || total_count_ <= 0) {
135  return static_cast<double>(rangemin_);
136  }
137  int64_t sum = 0;
138  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
139  sum += static_cast<int64_t>(index) * buckets_[index];
140  }
141  return static_cast<double>(sum) / total_count_ + rangemin_;
142 }
143 
144 /**********************************************************************
145  * STATS::sd
146  *
147  * Find the standard deviation of a stats class.
148  **********************************************************************/
149 double STATS::sd() const { // standard deviation
150  if (buckets_ == nullptr || total_count_ <= 0) {
151  return 0.0;
152  }
153  int64_t sum = 0;
154  double sqsum = 0.0;
155  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
156  sum += static_cast<int64_t>(index) * buckets_[index];
157  sqsum += static_cast<double>(index) * index * buckets_[index];
158  }
159  double variance = static_cast<double>(sum) / total_count_;
160  variance = sqsum / total_count_ - variance * variance;
161  if (variance > 0.0) {
162  return sqrt(variance);
163  }
164  return 0.0;
165 }
166 
167 /**********************************************************************
168  * STATS::ile
169  *
170  * Returns the fractile value such that frac fraction (in [0,1]) of samples
171  * has a value less than the return value.
172  **********************************************************************/
173 double STATS::ile(double frac) const {
174  if (buckets_ == nullptr || total_count_ == 0) {
175  return static_cast<double>(rangemin_);
176  }
177 #if 0
178  // TODO(rays) The existing code doesn't seem to be doing the right thing
179  // with target a double but this substitute crashes the code that uses it.
180  // Investigate and fix properly.
181  int target = IntCastRounded(frac * total_count_);
182  target = ClipToRange(target, 1, total_count_);
183 #else
184  double target = frac * total_count_;
185  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
186 #endif
187  int sum = 0;
188  int index = 0;
189  for (index = 0; index < rangemax_ - rangemin_ && sum < target; sum += buckets_[index++]) {
190  ;
191  }
192  if (index > 0) {
193  ASSERT_HOST(buckets_[index - 1] > 0);
194  return rangemin_ + index - static_cast<double>(sum - target) / buckets_[index - 1];
195  } else {
196  return static_cast<double>(rangemin_);
197  }
198 }
199 
200 /**********************************************************************
201  * STATS::min_bucket
202  *
203  * Find REAL minimum bucket - ile(0.0) isn't necessarily correct
204  **********************************************************************/
205 int32_t STATS::min_bucket() const { // Find min
206  if (buckets_ == nullptr || total_count_ == 0) {
207  return rangemin_;
208  }
209  int32_t min = 0;
210  for (min = 0; (min < rangemax_ - rangemin_) && (buckets_[min] == 0); min++) {
211  ;
212  }
213  return rangemin_ + min;
214 }
215 
216 /**********************************************************************
217  * STATS::max_bucket
218  *
219  * Find REAL maximum bucket - ile(1.0) isn't necessarily correct
220  **********************************************************************/
221 
222 int32_t STATS::max_bucket() const { // Find max
223  if (buckets_ == nullptr || total_count_ == 0) {
224  return rangemin_;
225  }
226  int32_t max;
227  for (max = rangemax_ - rangemin_ - 1; max > 0 && buckets_[max] == 0; max--) {
228  ;
229  }
230  return rangemin_ + max;
231 }
232 
233 /**********************************************************************
234  * STATS::median
235  *
236  * Finds a more useful estimate of median than ile(0.5).
237  *
238  * Overcomes a problem with ile() - if the samples are, for example,
239  * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
240  * between 6 and 13 = 9.5
241  **********************************************************************/
242 double STATS::median() const { // get median
243  if (buckets_ == nullptr) {
244  return static_cast<double>(rangemin_);
245  }
246  double median = ile(0.5);
247  int median_pile = static_cast<int>(floor(median));
248  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
249  int32_t min_pile;
250  int32_t max_pile;
251  /* Find preceding non zero pile */
252  for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--) {
253  ;
254  }
255  /* Find following non zero pile */
256  for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++) {
257  ;
258  }
259  median = (min_pile + max_pile) / 2.0;
260  }
261  return median;
262 }
263 
264 /**********************************************************************
265  * STATS::local_min
266  *
267  * Return true if this point is a local min.
268  **********************************************************************/
269 bool STATS::local_min(int32_t x) const {
270  if (buckets_ == nullptr) {
271  return false;
272  }
273  x = ClipToRange(x, rangemin_, rangemax_ - 1) - rangemin_;
274  if (buckets_[x] == 0) {
275  return true;
276  }
277  int32_t index; // table index
278  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index) {
279  ;
280  }
281  if (index >= 0 && buckets_[index] < buckets_[x]) {
282  return false;
283  }
284  for (index = x + 1; index < rangemax_ - rangemin_ && buckets_[index] == buckets_[x]; ++index) {
285  ;
286  }
287  if (index < rangemax_ - rangemin_ && buckets_[index] < buckets_[x]) {
288  return false;
289  } else {
290  return true;
291  }
292 }
293 
294 /**********************************************************************
295  * STATS::smooth
296  *
297  * Apply a triangular smoothing filter to the stats.
298  * This makes the modes a bit more useful.
299  * The factor gives the height of the triangle, i.e. the weight of the
300  * centre.
301  **********************************************************************/
302 void STATS::smooth(int32_t factor) {
303  if (buckets_ == nullptr || factor < 2) {
304  return;
305  }
306  STATS result(rangemin_, rangemax_);
307  int entrycount = rangemax_ - rangemin_;
308  for (int entry = 0; entry < entrycount; entry++) {
309  // centre weight
310  int count = buckets_[entry] * factor;
311  for (int offset = 1; offset < factor; offset++) {
312  if (entry - offset >= 0) {
313  count += buckets_[entry - offset] * (factor - offset);
314  }
315  if (entry + offset < entrycount) {
316  count += buckets_[entry + offset] * (factor - offset);
317  }
318  }
319  result.add(entry + rangemin_, count);
320  }
321  total_count_ = result.total_count_;
322  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
323 }
324 
325 /**********************************************************************
326  * STATS::cluster
327  *
328  * Cluster the samples into max_cluster clusters.
329  * Each call runs one iteration. The array of clusters must be
330  * max_clusters+1 in size as cluster 0 is used to indicate which samples
331  * have been used.
332  * The return value is the current number of clusters.
333  **********************************************************************/
334 
335 int32_t STATS::cluster(float lower, // thresholds
336  float upper,
337  float multiple, // distance threshold
338  int32_t max_clusters, // max no to make
339  STATS *clusters) { // array of clusters
340  bool new_cluster; // added one
341  float *centres; // cluster centres
342  int32_t entry; // bucket index
343  int32_t cluster; // cluster index
344  int32_t best_cluster; // one to assign to
345  int32_t new_centre = 0; // residual mode
346  int32_t new_mode; // pile count of new_centre
347  int32_t count; // pile to place
348  float dist; // from cluster
349  float min_dist; // from best_cluster
350  int32_t cluster_count; // no of clusters
351 
352  if (buckets_ == nullptr || max_clusters < 1) {
353  return 0;
354  }
355  centres = new float[max_clusters + 1];
356  for (cluster_count = 1;
357  cluster_count <= max_clusters && clusters[cluster_count].buckets_ != nullptr &&
358  clusters[cluster_count].total_count_ > 0;
359  cluster_count++) {
360  centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
361  new_centre = clusters[cluster_count].mode();
362  for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
363  pile_count(entry) <= pile_count(entry + 1);
364  entry--) {
365  count = pile_count(entry) - clusters[0].pile_count(entry);
366  if (count > 0) {
367  clusters[cluster_count].add(entry, count);
368  clusters[0].add(entry, count);
369  }
370  }
371  for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry < rangemax_ &&
372  pile_count(entry) <= pile_count(entry - 1);
373  entry++) {
374  count = pile_count(entry) - clusters[0].pile_count(entry);
375  if (count > 0) {
376  clusters[cluster_count].add(entry, count);
377  clusters[0].add(entry, count);
378  }
379  }
380  }
381  cluster_count--;
382 
383  if (cluster_count == 0) {
384  clusters[0].set_range(rangemin_, rangemax_);
385  }
386  do {
387  new_cluster = false;
388  new_mode = 0;
389  for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
390  count = buckets_[entry] - clusters[0].buckets_[entry];
391  // remaining pile
392  if (count > 0) { // any to handle
393  min_dist = static_cast<float>(INT32_MAX);
394  best_cluster = 0;
395  for (cluster = 1; cluster <= cluster_count; cluster++) {
396  dist = entry + rangemin_ - centres[cluster];
397  // find distance
398  if (dist < 0) {
399  dist = -dist;
400  }
401  if (dist < min_dist) {
402  min_dist = dist; // find least
403  best_cluster = cluster;
404  }
405  }
406  if (min_dist > upper // far enough for new
407  && (best_cluster == 0 || entry + rangemin_ > centres[best_cluster] * multiple ||
408  entry + rangemin_ < centres[best_cluster] / multiple)) {
409  if (count > new_mode) {
410  new_mode = count;
411  new_centre = entry + rangemin_;
412  }
413  }
414  }
415  }
416  // need new and room
417  if (new_mode > 0 && cluster_count < max_clusters) {
418  cluster_count++;
419  new_cluster = true;
420  if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
421  delete[] centres;
422  return 0;
423  }
424  centres[cluster_count] = static_cast<float>(new_centre);
425  clusters[cluster_count].add(new_centre, new_mode);
426  clusters[0].add(new_centre, new_mode);
427  for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
428  pile_count(entry) <= pile_count(entry + 1);
429  entry--) {
430  count = pile_count(entry) - clusters[0].pile_count(entry);
431  if (count > 0) {
432  clusters[cluster_count].add(entry, count);
433  clusters[0].add(entry, count);
434  }
435  }
436  for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry < rangemax_ &&
437  pile_count(entry) <= pile_count(entry - 1);
438  entry++) {
439  count = pile_count(entry) - clusters[0].pile_count(entry);
440  if (count > 0) {
441  clusters[cluster_count].add(entry, count);
442  clusters[0].add(entry, count);
443  }
444  }
445  centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
446  }
447  } while (new_cluster && cluster_count < max_clusters);
448  delete[] centres;
449  return cluster_count;
450 }
451 
452 // Helper tests that the current index is still part of the peak and gathers
453 // the data into the peak, returning false when the peak is ended.
454 // src_buckets[index] - used_buckets[index] is the unused part of the histogram.
455 // prev_count is the histogram count of the previous index on entry and is
456 // updated to the current index on return.
457 // total_count and total_value are accumulating the mean of the peak.
458 static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int *prev_count,
459  int *total_count, double *total_value) {
460  int pile_count = src_buckets[index] - used_buckets[index];
461  if (pile_count <= *prev_count && pile_count > 0) {
462  // Accumulate count and index.count product.
463  *total_count += pile_count;
464  *total_value += index * pile_count;
465  // Mark this index as used
466  used_buckets[index] = src_buckets[index];
467  *prev_count = pile_count;
468  return true;
469  } else {
470  return false;
471  }
472 }
473 
474 // Finds (at most) the top max_modes modes, well actually the whole peak around
475 // each mode, returning them in the given modes vector as a <mean of peak,
476 // total count of peak> pair in order of decreasing total count.
477 // Since the mean is the key and the count the data in the pair, a single call
478 // to sort on the output will re-sort by increasing mean of peak if that is
479 // more useful than decreasing total count.
480 // Returns the actual number of modes found.
481 int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {
482  if (max_modes <= 0) {
483  return 0;
484  }
485  int src_count = rangemax_ - rangemin_;
486  // Used copies the counts in buckets_ as they get used.
487  STATS used(rangemin_, rangemax_);
488  modes.clear();
489  // Total count of the smallest peak found so far.
490  int least_count = 1;
491  // Mode that is used as a seed for each peak
492  int max_count = 0;
493  do {
494  // Find an unused mode.
495  max_count = 0;
496  int max_index = 0;
497  for (int src_index = 0; src_index < src_count; src_index++) {
498  int pile_count = buckets_[src_index] - used.buckets_[src_index];
499  if (pile_count > max_count) {
500  max_count = pile_count;
501  max_index = src_index;
502  }
503  }
504  if (max_count > 0) {
505  // Copy the bucket count to used so it doesn't get found again.
506  used.buckets_[max_index] = max_count;
507  // Get the entire peak.
508  double total_value = max_index * max_count;
509  int total_count = max_count;
510  int prev_pile = max_count;
511  for (int offset = 1; max_index + offset < src_count; ++offset) {
512  if (!GatherPeak(max_index + offset, buckets_, used.buckets_, &prev_pile, &total_count,
513  &total_value)) {
514  break;
515  }
516  }
517  prev_pile = buckets_[max_index];
518  for (int offset = 1; max_index - offset >= 0; ++offset) {
519  if (!GatherPeak(max_index - offset, buckets_, used.buckets_, &prev_pile, &total_count,
520  &total_value)) {
521  break;
522  }
523  }
524  if (total_count > least_count || modes.size() < static_cast<size_t>(max_modes)) {
525  // We definitely want this mode, so if we have enough discard the least.
526  if (modes.size() == static_cast<size_t>(max_modes)) {
527  modes.resize(max_modes - 1);
528  }
529  size_t target_index = 0;
530  // Linear search for the target insertion point.
531  while (target_index < modes.size() && modes[target_index].data() >= total_count) {
532  ++target_index;
533  }
534  auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);
535  modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));
536  least_count = modes.back().data();
537  }
538  }
539  } while (max_count > 0);
540  return modes.size();
541 }
542 
543 /**********************************************************************
544  * STATS::print
545  *
546  * Prints a summary and table of the histogram.
547  **********************************************************************/
548 void STATS::print() const {
549  if (buckets_ == nullptr) {
550  return;
551  }
552  int32_t min = min_bucket() - rangemin_;
553  int32_t max = max_bucket() - rangemin_;
554 
555  int num_printed = 0;
556  for (int index = min; index <= max; index++) {
557  if (buckets_[index] != 0) {
558  tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
559  if (++num_printed % 8 == 0) {
560  tprintf("\n");
561  }
562  }
563  }
564  tprintf("\n");
565  print_summary();
566 }
567 
568 /**********************************************************************
569  * STATS::print_summary
570  *
571  * Print a summary of the stats.
572  **********************************************************************/
573 void STATS::print_summary() const {
574  if (buckets_ == nullptr) {
575  return;
576  }
577  int32_t min = min_bucket();
578  int32_t max = max_bucket();
579  tprintf("Total count=%d\n", total_count_);
580  tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
581  tprintf("Lower quartile=%.2f\n", ile(0.25));
582  tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
583  tprintf("Upper quartile=%.2f\n", ile(0.75));
584  tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
585  tprintf("Range=%d\n", max + 1 - min);
586  tprintf("Mean= %.2f\n", mean());
587  tprintf("SD= %.2f\n", sd());
588 }
589 
590 /**********************************************************************
591  * STATS::plot
592  *
593  * Draw a histogram of the stats table.
594  **********************************************************************/
595 
596 #ifndef GRAPHICS_DISABLED
597 void STATS::plot(ScrollView *window, // to draw in
598  float xorigin, // bottom left
599  float yorigin,
600  float xscale, // one x unit
601  float yscale, // one y unit
602  ScrollView::Color colour) const { // colour to draw in
603  if (buckets_ == nullptr) {
604  return;
605  }
606  window->Pen(colour);
607 
608  for (int index = 0; index < rangemax_ - rangemin_; index++) {
609  window->Rectangle(xorigin + xscale * index, yorigin, xorigin + xscale * (index + 1),
610  yorigin + yscale * buckets_[index]);
611  }
612 }
613 #endif
614 
615 /**********************************************************************
616  * STATS::plotline
617  *
618  * Draw a histogram of the stats table. (Line only)
619  **********************************************************************/
620 
621 #ifndef GRAPHICS_DISABLED
622 void STATS::plotline(ScrollView *window, // to draw in
623  float xorigin, // bottom left
624  float yorigin,
625  float xscale, // one x unit
626  float yscale, // one y unit
627  ScrollView::Color colour) const { // colour to draw in
628  if (buckets_ == nullptr) {
629  return;
630  }
631  window->Pen(colour);
632  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
633  for (int index = 0; index < rangemax_ - rangemin_; index++) {
634  window->DrawTo(xorigin + xscale * index, yorigin + yscale * buckets_[index]);
635  }
636 }
637 #endif
638 
639 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:110
void print_summary() const
Definition: statistc.cpp:573
void print() const
Definition: statistc.cpp:548
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:597
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:59
double median() const
Definition: statistc.cpp:242
int32_t min_bucket() const
Definition: statistc.cpp:205
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:335
bool local_min(int32_t x) const
Definition: statistc.cpp:269
int32_t mode() const
Definition: statistc.cpp:113
void smooth(int32_t factor)
Definition: statistc.cpp:302
int32_t max_bucket() const
Definition: statistc.cpp:222
int top_n_modes(int max_modes, std::vector< KDPairInc< float, int >> &modes) const
Definition: statistc.cpp:481
double ile(double frac) const
Definition: statistc.cpp:173
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:622
double sd() const
Definition: statistc.cpp:149
double mean() const
Definition: statistc.cpp:133
void Pen(Color color)
Definition: scrollview.cpp:723
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:589
void SetCursor(int x, int y)
Definition: scrollview.cpp:498
void DrawTo(int x, int y)
Definition: scrollview.cpp:504