tesseract  5.0.0
cluster.h
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.h
3  ** Purpose: Definition of feature space clustering routines
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 #ifndef CLUSTER_H
19 #define CLUSTER_H
20 
21 #include "kdtree.h"
22 #include "oldlist.h"
23 
24 namespace tesseract {
25 
26 struct BUCKETS;
27 
28 #define MINBUCKETS 5
29 #define MAXBUCKETS 39
30 
31 /*----------------------------------------------------------------------
32  Types
33 ----------------------------------------------------------------------*/
34 struct CLUSTER {
35  CLUSTER(size_t n) : Mean(n) {
36  }
37 
38  ~CLUSTER() {
39  delete Left;
40  delete Right;
41  }
42 
43  bool Clustered : 1; // true if included in a higher cluster
44  bool Prototype : 1; // true if cluster represented by a proto
45  unsigned SampleCount : 30; // number of samples in this cluster
46  CLUSTER *Left; // ptr to left sub-cluster
47  CLUSTER *Right; // ptr to right sub-cluster
48  int32_t CharID; // identifier of char sample came from
49  std::vector<float> Mean; // mean of cluster - SampleSize floats
50 };
51 using SAMPLE = CLUSTER; // can refer to as either sample or cluster
52 
53 enum PROTOSTYLE { spherical, elliptical, mixed, automatic };
54 
55 struct CLUSTERCONFIG { // parameters to control clustering
56  PROTOSTYLE ProtoStyle; // specifies types of protos to be made
57  float MinSamples; // min # of samples per proto - % of total
58  float MaxIllegal; // max percentage of samples in a cluster which
59  // have more than 1 feature in that cluster
60  float Independence; // desired independence between dimensions
61  double Confidence; // desired confidence in prototypes created
62  int MagicSamples; // Ideal number of samples in a cluster.
63 };
64 
65 enum DISTRIBUTION { normal, uniform, D_random, DISTRIBUTION_COUNT };
66 
67 union FLOATUNION {
68  float Spherical;
69  float *Elliptical;
70 };
71 
72 struct PROTOTYPE {
73  bool Significant : 1; // true if prototype is significant
74  bool Merged : 1; // Merged after clustering so do not output
75  // but kept for display purposes. If it has no
76  // samples then it was actually merged.
77  // Otherwise it matched an already significant
78  // cluster.
79  unsigned Style : 2; // spherical, elliptical, or mixed
80  unsigned NumSamples : 28; // number of samples in the cluster
81  CLUSTER *Cluster; // ptr to cluster which made prototype
82  std::vector<DISTRIBUTION> Distrib; // different distribution for each dimension
83  std::vector<float> Mean; // prototype mean
84  float TotalMagnitude; // total magnitude over all dimensions
85  float LogMagnitude; // log base e of TotalMagnitude
86  FLOATUNION Variance; // prototype variance
87  FLOATUNION Magnitude; // magnitude of density function
88  FLOATUNION Weight; // weight of density function
89 };
90 
91 struct CLUSTERER {
92  int16_t SampleSize; // number of parameters per sample
93  PARAM_DESC *ParamDesc; // description of each parameter
94  int32_t NumberOfSamples; // total number of samples being clustered
95  KDTREE *KDTree; // for optimal nearest neighbor searching
96  CLUSTER *Root; // ptr to root cluster of cluster tree
97  LIST ProtoList; // list of prototypes
98  uint32_t NumChar; // # of characters represented by samples
99  // cache of reusable histograms by distribution type and number of buckets.
100  BUCKETS *bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
101 };
102 
103 struct SAMPLELIST {
104  int32_t NumSamples; // number of samples in list
105  int32_t MaxNumSamples; // maximum size of list
106  SAMPLE *Sample[1]; // array of ptrs to sample data structures
107 };
108 
109 // low level cluster tree analysis routines.
110 #define InitSampleSearch(S, C) (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))
111 
112 /*--------------------------------------------------------------------------
113  Public Function Prototypes
114 --------------------------------------------------------------------------*/
115 TESS_API
116 CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);
117 
118 TESS_API
119 SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID);
120 
121 TESS_API
122 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
123 
124 TESS_API
125 void FreeClusterer(CLUSTERER *Clusterer);
126 
127 TESS_API
128 void FreeProtoList(LIST *ProtoList);
129 
130 void FreePrototype(void *arg); // PROTOTYPE *Prototype);
131 
132 CLUSTER *NextSample(LIST *SearchState);
133 
134 float Mean(PROTOTYPE *Proto, uint16_t Dimension);
135 
136 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension);
137 
138 TESS_API
139 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
140  float m1[], float m2[]);
141 
142 } // namespace tesseract
143 
144 #endif
#define MAXBUCKETS
Definition: cluster.h:29
#define MINBUCKETS
Definition: cluster.h:28
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:1871
list_rec * LIST
Definition: oldlist.h:125
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1663
CLUSTERCONFIG Config
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:1639
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1598
void FreePrototype(void *arg)
Definition: cluster.cpp:1609
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1441
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1674
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1576
PROTOSTYLE
Definition: cluster.h:53
@ spherical
Definition: cluster.h:53
@ mixed
Definition: cluster.h:53
@ elliptical
Definition: cluster.h:53
@ automatic
Definition: cluster.h:53
CLUSTER SAMPLE
Definition: cluster.h:51
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1492
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1544
DISTRIBUTION
Definition: cluster.h:65
@ D_random
Definition: cluster.h:65
@ DISTRIBUTION_COUNT
Definition: cluster.h:65
@ uniform
Definition: cluster.h:65
@ normal
Definition: cluster.h:65
CLUSTER * Right
Definition: cluster.h:47
CLUSTER(size_t n)
Definition: cluster.h:35
int32_t CharID
Definition: cluster.h:48
unsigned SampleCount
Definition: cluster.h:45
CLUSTER * Left
Definition: cluster.h:46
std::vector< float > Mean
Definition: cluster.h:49
PROTOSTYLE ProtoStyle
Definition: cluster.h:56
float * Elliptical
Definition: cluster.h:69
float TotalMagnitude
Definition: cluster.h:84
unsigned Style
Definition: cluster.h:79
std::vector< float > Mean
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:81
FLOATUNION Magnitude
Definition: cluster.h:87
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
std::vector< DISTRIBUTION > Distrib
Definition: cluster.h:82
FLOATUNION Weight
Definition: cluster.h:88
int16_t SampleSize
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:96
PARAM_DESC * ParamDesc
Definition: cluster.h:93
KDTREE * KDTree
Definition: cluster.h:95
uint32_t NumChar
Definition: cluster.h:98
int32_t NumberOfSamples
Definition: cluster.h:94
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:100
SAMPLE * Sample[1]
Definition: cluster.h:106
int32_t MaxNumSamples
Definition: cluster.h:105
#define TESS_API
Definition: export.h:34