tesseract  5.0.0
statistc.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.h (Formerly stats.h)
3  * Description: Class description for STATS class.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
20 #define TESSERACT_CCSTRUCT_STATISTC_H_
21 
22 #include <cstdio>
23 #include "kdpair.h"
24 #include "scrollview.h"
25 
26 namespace tesseract {
27 
28 // Simple histogram-based statistics for integer values in a known
29 // range, such that the range is small compared to the number of samples.
30 class TESS_API STATS {
31 public:
32  // The histogram buckets are in the range
33  // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
34  // [min_bucket_value, max_bucket_value].
35  // Any data under min_bucket value is silently mapped to min_bucket_value,
36  // and likewise, any data over max_bucket_value is silently mapped to
37  // max_bucket_value.
38  // In the internal array, min_bucket_value maps to 0 and
39  // max_bucket_value_plus_1 - min_bucket_value to the array size.
40  // TODO(rays) This is ugly. Convert the second argument to
41  // max_bucket_value and all the code that uses it.
42  STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
43  STATS() = default; // empty for arrays
44 
45  ~STATS();
46 
47  // (Re)Sets the range and clears the counts.
48  // See the constructor for info on max and min values.
49  bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
50 
51  void clear(); // empty buckets
52 
53  void add(int32_t value, int32_t count);
54 
55  // "Accessors" return various statistics on the data.
56  int32_t mode() const; // get mode of samples
57  double mean() const; // get mean of samples
58  double sd() const; // standard deviation
59  // Returns the fractile value such that frac fraction (in [0,1]) of samples
60  // has a value less than the return value.
61  double ile(double frac) const;
62  // Returns the minimum used entry in the histogram (ie the minimum of the
63  // data, NOT the minimum of the supplied range, nor is it an index.)
64  // Would normally be called min(), but that is a reserved word in VC++.
65  int32_t min_bucket() const; // Find min
66  // Returns the maximum used entry in the histogram (ie the maximum of the
67  // data, NOT the maximum of the supplied range, nor is it an index.)
68  int32_t max_bucket() const; // Find max
69  // Finds a more useful estimate of median than ile(0.5).
70  // Overcomes a problem with ile() - if the samples are, for example,
71  // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
72  // between 6 and 13 = 9.5
73  double median() const; // get median of samples
74  // Returns the count of the given value.
75  int32_t pile_count(int32_t value) const {
76  if (value <= rangemin_) {
77  return buckets_[0];
78  }
79  if (value >= rangemax_ - 1) {
80  return buckets_[rangemax_ - rangemin_ - 1];
81  }
82  return buckets_[value - rangemin_];
83  }
84  // Returns the total count of all buckets.
85  int32_t get_total() const {
86  return total_count_; // total of all piles
87  }
88  // Returns true if x is a local min.
89  bool local_min(int32_t x) const;
90 
91  // Apply a triangular smoothing filter to the stats.
92  // This makes the modes a bit more useful.
93  // The factor gives the height of the triangle, i.e. the weight of the
94  // centre.
95  void smooth(int32_t factor);
96 
97  // Cluster the samples into max_cluster clusters.
98  // Each call runs one iteration. The array of clusters must be
99  // max_clusters+1 in size as cluster 0 is used to indicate which samples
100  // have been used.
101  // The return value is the current number of clusters.
102  int32_t cluster(float lower, // thresholds
103  float upper,
104  float multiple, // distance threshold
105  int32_t max_clusters, // max no to make
106  STATS *clusters); // array of clusters
107 
108  // Finds (at most) the top max_modes modes, well actually the whole peak
109  // around each mode, returning them in the given modes vector as a <mean of
110  // peak, total count of peak> pair in order of decreasing total count. Since
111  // the mean is the key and the count the data in the pair, a single call to
112  // sort on the output will re-sort by increasing mean of peak if that is more
113  // useful than decreasing total count. Returns the actual number of modes
114  // found.
115  int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;
116 
117  // Prints a summary and table of the histogram.
118  void print() const;
119  // Prints summary stats only of the histogram.
120  void print_summary() const;
121 
122 #ifndef GRAPHICS_DISABLED
123  // Draws the histogram as a series of rectangles.
124  void plot(ScrollView *window, // window to draw in
125  float xorigin, // origin of histo
126  float yorigin, // gram
127  float xscale, // size of one unit
128  float yscale, // size of one uint
129  ScrollView::Color colour) const; // colour to draw in
130 
131  // Draws a line graph of the histogram.
132  void plotline(ScrollView *window, // window to draw in
133  float xorigin, // origin of histo
134  float yorigin, // gram
135  float xscale, // size of one unit
136  float yscale, // size of one uint
137  ScrollView::Color colour) const; // colour to draw in
138 #endif // !GRAPHICS_DISABLED
139 
140 private:
141  int32_t rangemin_ = 0; // min of range
142  // rangemax_ is not well named as it is really one past the max.
143  int32_t rangemax_ = 0; // max of range
144  int32_t total_count_ = 0; // no of samples
145  int32_t *buckets_ = nullptr; // array of cells
146 };
147 
148 } // namespace tesseract
149 
150 #endif // TESSERACT_CCSTRUCT_STATISTC_H_
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
int32_t get_total() const
Definition: statistc.h:85
#define TESS_API
Definition: export.h:34