tesseract  5.0.0
simddetect.cpp
Go to the documentation of this file.
1 // File: simddetect.cpp
3 // Description: Architecture detector.
4 // Author: Stefan Weil (based on code from Ray Smith)
5 //
6 // (C) Copyright 2014, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
17 
18 #ifdef HAVE_CONFIG_H
19 # include "config_auto.h" // for HAVE_AVX, ...
20 #endif
21 #include <numeric> // for std::inner_product
22 #include "dotproduct.h"
23 #include "intsimdmatrix.h" // for IntSimdMatrix
24 #include "params.h" // for STRING_VAR
25 #include "simddetect.h"
26 #include "tprintf.h" // for tprintf
27 
28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29 // The GNU compiler g++ fails to compile with the Accelerate framework
30 // (tested with versions 10 and 11), so unconditionally disable it.
31 #undef HAVE_FRAMEWORK_ACCELERATE
32 #endif
33 
34 #if defined(HAVE_FRAMEWORK_ACCELERATE)
35 
36 // Use Apple Accelerate framework.
37 // https://developer.apple.com/documentation/accelerate/simd
38 
39 #include <Accelerate/Accelerate.h>
40 
41 #endif
42 
43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44 # define HAS_CPUID
45 #endif
46 
47 #if defined(HAS_CPUID)
48 # if defined(__GNUC__)
49 # include <cpuid.h>
50 # elif defined(_WIN32)
51 # include <intrin.h>
52 # endif
53 #endif
54 
55 #if defined(HAVE_NEON) && !defined(__aarch64__)
56 # ifdef ANDROID
57 # include <cpu-features.h>
58 # else
59 /* Assume linux */
60 #ifndef __riscos__
61 # include <asm/hwcap.h>
62 # include <sys/auxv.h>
63 #endif
64 # endif
65 #endif
66 
67 namespace tesseract {
68 
69 // Computes and returns the dot product of the two n-vectors u and v.
70 // Note: because the order of addition is different among the different dot
71 // product functions, the results can (and do) vary slightly (although they
72 // agree to within about 4e-15). This produces different results when running
73 // training, despite all random inputs being precisely equal.
74 // To get consistent results, use just one of these dot product functions.
75 // On a test multi-layer network, serial is 57% slower than SSE, and AVX
76 // is about 8% faster than SSE. This suggests that the time is memory
77 // bandwidth constrained and could benefit from holding the reused vector
78 // in AVX registers.
80 
// Config variable naming the dot product implementation to use; its default
// value is "auto".
81 static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
82 
// Definition of the static SIMDDetect member; constructing it runs the
// detection code in the SIMDDetect constructor.
83 SIMDDetect SIMDDetect::detector;
84 
// Definitions of the static availability flags. Which flags get defined
// depends on the target: only the NEON flag on ARM builds, the x86 flags
// otherwise.
85 #if defined(__aarch64__)
86 // ARMv8 always has NEON.
87 bool SIMDDetect::neon_available_ = true;
88 #elif defined(HAVE_NEON)
89 // If true, then Neon has been detected.
90 bool SIMDDetect::neon_available_;
91 #else
92 // If true, then AVX has been detected.
93 bool SIMDDetect::avx_available_;
// If true, then AVX2 has been detected.
94 bool SIMDDetect::avx2_available_;
// If true, then AVX-512F has been detected.
95 bool SIMDDetect::avx512F_available_;
// If true, then AVX-512BW has been detected.
96 bool SIMDDetect::avx512BW_available_;
97 // If true, then FMA has been detected.
98 bool SIMDDetect::fma_available_;
99 // If true, then SSE4.1 has been detected.
100 bool SIMDDetect::sse_available_;
101 #endif
102 
#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes and returns the dot product of the two n-vectors u and v by
// delegating to the Accelerate framework: vDSP_dotpr when FAST_FLOAT is
// defined, vDSP_dotprD otherwise. Both vectors are read with stride 1.
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
  const int unit_stride = 1;
  TFloat result = 0;
#  if defined(FAST_FLOAT)
  vDSP_dotpr(u, unit_stride, v, unit_stride, &result, n);
#  else
  vDSP_dotprD(u, unit_stride, v, unit_stride, &result, n);
#  endif
  return result;
}
#endif
115 
116 // Computes and returns the dot product of the two n-vectors u and v.
117 static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
118  TFloat total = 0;
119  for (int k = 0; k < n; ++k) {
120  total += u[k] * v[k];
121  }
122  return total;
123 }
124 
125 // Compute dot product using std::inner_product.
126 static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
127  return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
128 }
129 
130 static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
131  DotProduct = f;
133 }
134 
135 // Constructor.
136 // Tests the architecture in a system-dependent way to detect AVX, SSE and
137 // any other available SIMD equipment.
138 // __GNUC__ is also defined by compilers that include GNU extensions such as
139 // clang.
// Order of work: install the generic dot product as fallback, probe the CPU
// (CPUID on x86 builds, OS facilities for 32-bit ARM NEON), walk the
// selection chain, and finally honour a DOTPRODUCT environment variable by
// storing its value in the dotproduct variable and calling Update().
140 SIMDDetect::SIMDDetect() {
141  // The fallback is a generic dot product calculation.
142  SetDotProduct(DotProductGeneric);
143 
144 #if defined(HAS_CPUID)
145 # if defined(__GNUC__)
146  unsigned int eax, ebx, ecx, edx;
 // Query CPUID leaf 1; a zero return value skips all x86 feature tests.
147  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
148  // Note that these tests all use hex because the older compilers don't have
149  // the newer flags.
150 # if defined(HAVE_SSE4_1)
 // Leaf 1, ECX bit 19.
151  sse_available_ = (ecx & 0x00080000) != 0;
152 # endif
153 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
 // Executes xgetbv with ECX = 0 and returns the EAX half of the result;
 // EDX is only declared as clobbered.
154  auto xgetbv = []() {
155  uint32_t xcr0;
156  __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
157  return xcr0;
158  };
 // Leaf 1, ECX bit 27 must be set before xgetbv is executed (the && short
 // circuits); bits 1 and 2 of the xgetbv result must both be set.
159  if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
160  // OSXSAVE bit is set, XMM state and YMM state are fine.
161 # if defined(HAVE_FMA)
 // Leaf 1, ECX bit 12.
162  fma_available_ = (ecx & 0x00001000) != 0;
163 # endif
164 # if defined(HAVE_AVX)
 // Leaf 1, ECX bit 28.
165  avx_available_ = (ecx & 0x10000000) != 0;
 // NOTE(review): on this compiler path the AVX2 / AVX-512 probe is guarded
 // by HAVE_AVX and avx_available_, whereas the _WIN32 path below guards it
 // with HAVE_AVX2 - confirm the difference is intended.
166  if (avx_available_) {
167  // There is supposed to be a __get_cpuid_count function, but this is all
168  // there is in my cpuid.h. It is a macro for an asm statement and cannot
169  // be used inside an if.
 // Leaf 7, subleaf 0: EBX bit 5 -> AVX2, bit 16 -> AVX-512F,
 // bit 30 -> AVX-512BW. This overwrites the leaf 1 register values.
170  __cpuid_count(7, 0, eax, ebx, ecx, edx);
171  avx2_available_ = (ebx & 0x00000020) != 0;
172  avx512F_available_ = (ebx & 0x00010000) != 0;
173  avx512BW_available_ = (ebx & 0x40000000) != 0;
174  }
175 # endif
176  }
177 # endif
178  }
179 # elif defined(_WIN32)
 // Same tests as the __GNUC__ path, using the intrinsics from <intrin.h>.
 // The masks applied to cpuInfo[2] and cpuInfo[1] are the ones applied to
 // ecx and ebx above.
180  int cpuInfo[4];
181  int max_function_id;
 // Leaf 0: cpuInfo[0] is taken as the highest supported function id.
182  __cpuid(cpuInfo, 0);
183  max_function_id = cpuInfo[0];
184  if (max_function_id >= 1) {
185  __cpuid(cpuInfo, 1);
186 # if defined(HAVE_SSE4_1)
187  sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
188 # endif
189 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
190  if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
191  // OSXSAVE bit is set, XMM state and YMM state are fine.
192 # if defined(HAVE_FMA)
193  fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
194 # endif
195 # if defined(HAVE_AVX)
196  avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
197 # endif
198 # if defined(HAVE_AVX2)
 // Leaf 7 is only queried when leaf 0 reported it as supported.
199  if (max_function_id >= 7) {
200  __cpuid(cpuInfo, 7);
201  avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
202  avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
203  avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
204  }
205 # endif
206  }
207 # endif
208  }
209 # else
210 # error "I don't know how to test for SIMD with this compiler"
211 # endif
212 #endif
213 
 // 32-bit ARM only: on __aarch64__ the flag is initialised to true at its
 // definition and no runtime test is made.
214 #if defined(HAVE_NEON) && !defined(__aarch64__)
215 # ifdef ANDROID
216  {
 // The flag is only written when the CPU family is ARM.
217  AndroidCpuFamily family = android_getCpuFamily();
218  if (family == ANDROID_CPU_FAMILY_ARM)
219  neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
220  }
221 # else
222  /* Assume linux */
223 #ifndef __riscos__
 // Test the HWCAP_NEON bit of the AT_HWCAP auxiliary vector entry.
224  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
225 #else
 // No runtime test on RISC OS: NEON is assumed to be present.
226  neon_available_ = 1;
227 #endif
228 # endif
229 #endif
230 
231  // Select code for calculation of dot product based on autodetection.
 // The chain is tested in the order AVX2, AVX, SSE, NEON; the first flag
 // that is set wins.
 // NOTE(review): every branch below contains only a comment in this listing
 // and the listing's own numbering skips one line per branch (237, 242, 247,
 // 252). Presumably each branch originally held a call selecting the
 // matching implementation - confirm against upstream.
232  if (false) {
233  // This is a dummy to support conditional compilation.
234 #if defined(HAVE_AVX2)
235  } else if (avx2_available_) {
236  // AVX2 detected.
238 #endif
239 #if defined(HAVE_AVX)
240  } else if (avx_available_) {
241  // AVX detected.
243 #endif
244 #if defined(HAVE_SSE4_1)
245  } else if (sse_available_) {
246  // SSE detected.
248 #endif
249 #if defined(HAVE_NEON) || defined(__aarch64__)
250  } else if (neon_available_) {
251  // NEON detected.
253 #endif
254  }
255 
 // An environment variable named DOTPRODUCT, when set, replaces the value
 // of the dotproduct variable; Update() is then called to act on it.
256  const char *dotproduct_env = getenv("DOTPRODUCT");
257  if (dotproduct_env != nullptr) {
258  // Override automatic settings by value from environment variable.
259  dotproduct = dotproduct_env;
260  Update();
261  }
262 }
263 
 // NOTE(review): the line that opens this function is not present in this
 // listing (the numbering jumps from 263 to 265); the listing's
 // cross-reference places the definition of Update() at line 264 - confirm
 // against upstream. Likewise several branches below have lost one line each
 // (276, 281, 287, 293, 299, 309), presumably the call that selects the
 // implementation named by the branch.
265  // Select code for calculation of dot product based on the
266  // value of the config variable if that value is not empty.
 // Name that is written back into the dotproduct variable at the end.
267  const char *dotproduct_method = "generic";
268  if (dotproduct == "auto") {
269  // Automatic detection. Nothing to be done.
 // NOTE(review): dotproduct_method keeps its initial value here, so the
 // variable is rewritten from "auto" to "generic" at the end of the
 // function - confirm this is intended.
270  } else if (dotproduct == "generic") {
271  // Generic code selected by config variable.
272  SetDotProduct(DotProductGeneric);
273  dotproduct_method = "generic";
274  } else if (dotproduct == "native") {
275  // Native optimized code selected by config variable.
277  dotproduct_method = "native";
278 #if defined(HAVE_AVX2)
279  } else if (dotproduct == "avx2") {
280  // AVX2 selected by config variable.
282  dotproduct_method = "avx2";
283 #endif
284 #if defined(HAVE_AVX)
285  } else if (dotproduct == "avx") {
286  // AVX selected by config variable.
288  dotproduct_method = "avx";
289 #endif
290 #if defined(HAVE_FMA)
291  } else if (dotproduct == "fma") {
292  // FMA selected by config variable.
294  dotproduct_method = "fma";
295 #endif
296 #if defined(HAVE_SSE4_1)
297  } else if (dotproduct == "sse") {
298  // SSE selected by config variable.
300  dotproduct_method = "sse";
301 #endif
302 #if defined(HAVE_FRAMEWORK_ACCELERATE)
303  } else if (dotproduct == "accelerate") {
 // Accelerate framework selected by config variable; the currently
 // installed IntSimdMatrix pointer is passed through unchanged.
 // NOTE(review): dotproduct_method is not updated in this branch, so the
 // variable ends up as "generic" although the Accelerate function is
 // installed - confirm.
304  SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
305 #endif
306 #if defined(HAVE_NEON) || defined(__aarch64__)
 // Unlike the x86 branches, "neon" is only accepted when NEON was
 // detected; otherwise the value falls through to the branches below.
307  } else if (dotproduct == "neon" && neon_available_) {
308  // NEON selected by config variable.
310  dotproduct_method = "neon";
311 #endif
312  } else if (dotproduct == "std::inner_product") {
313  // std::inner_product selected by config variable.
314  SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
315  dotproduct_method = "std::inner_product";
316  } else {
317  // Unsupported value of config variable.
 // No implementation is changed here; a warning and the list of accepted
 // values are printed.
 // NOTE(review): the list below never mentions "neon", although a "neon"
 // branch exists above on NEON builds.
318  tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
319  dotproduct.c_str());
320  tprintf(
321  "Supported values for dotproduct: auto generic native"
322 #if defined(HAVE_AVX2)
323  " avx2"
324 #endif
325 #if defined(HAVE_AVX)
326  " avx"
327 #endif
328 #if defined(HAVE_FMA)
329  " fma"
330 #endif
331 #if defined(HAVE_SSE4_1)
332  " sse"
333 #endif
334 #if defined(HAVE_FRAMEWORK_ACCELERATE)
335  " accelerate"
336 #endif
337  " std::inner_product.\n");
338  }
339 
 // Write the chosen method name back into the config variable.
340  dotproduct.set_value(dotproduct_method);
341 }
342 
343 } // namespace tesseract
#define STRING_VAR(name, val, comment)
Definition: params.h:362
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
Definition: simddetect.h:26
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
Definition: dotproduct.cpp:22
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
Definition: simddetect.cpp:79
double TFloat
Definition: tesstypes.h:39
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()
Definition: simddetect.cpp:264