tesseract  5.0.0
baseapi_thread_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 // Unit test to run Tesseract instances in parallel threads and verify
13 // the OCR result.
14 
15 // Note that success of running this test as-is does NOT verify
16 // thread-safety. For that, you need to run this binary under TSAN using the
17 // associated baseapi_thread_test_with_tsan.sh script.
18 //
19 // The tests are partitioned by instance to allow running Tesseract/Cube/both
20 // and by stage to run initialization/recognition/both. See flag descriptions
21 // for details.
22 
#include <functional>
#include <memory>
#include <string>
#include <vector>
#ifdef INCLUDE_TENSORFLOW
#  include <tensorflow/core/lib/core/threadpool.h>
#endif
#include <allheaders.h>
#include <tesseract/baseapi.h>
#include "commandlineflags.h"
#include "include_gunit.h"
#include "log.h"
#include "image.h"
35 
// Run with Tesseract instances.
BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances");
// Run with Cube instances.
// Note that with TSAN, Cube typically takes much longer to test. Ignoring
// std::string operations using the associated tess_tsan.ignore file when
// testing Cube significantly reduces testing time.
BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances");

// Number of copies of each language's test case: the fixture loads
// reps * number_of_langs_tested images and the parallel tests schedule one
// task per image.
// When used with TSAN, having more repetitions can help in finding hidden
// thread-safety violations at the expense of increased testing time.
INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.");

// Size of the worker pool. Any value below 1 (including the default of 0)
// makes the fixture fall back to reps * number_of_langs_tested.
INT_PARAM_FLAG(max_concurrent_instances, 0,
               "Maximum number of instances to run in parallel at any given "
               "instant. The number of concurrent instances cannot exceed "
               "reps * number_of_langs_tested, which is also the default value.");
52 
53 namespace tesseract {
54 
// Parallel, nullptr-terminated tables describing the Tesseract test cases:
// entry i of each array gives the language code, the image file name and the
// expected OCR text of the same case. The fixture walks all three in
// lock-step and stops at the first nullptr.
static const char *kTessLangs[] = {"eng", "vie", nullptr};
static const char *kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr};
// Non-ASCII expected text is written as UTF-8 byte escapes.
static const char *kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", nullptr};

// Same three-table layout for the Cube test cases; the expected text is again
// given as UTF-8 byte escapes.
static const char *kCubeLangs[] = {"hin", "ara", nullptr};
static const char *kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr};
static const char *kCubeTruthText[] = {"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
                                       "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
63 
64 class BaseapiThreadTest : public ::testing::Test {
65 protected:
66  static void SetUpTestCase() {
67  CHECK(FLAGS_test_tesseract || FLAGS_test_cube)
68  << "Need to test at least one of Tesseract/Cube!";
69  // Form a list of langs/gt_text/image_files we will work with.
70  std::vector<std::string> image_files;
71  if (FLAGS_test_tesseract) {
72  int i = 0;
73  while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) {
74  langs_.emplace_back(kTessLangs[i]);
75  gt_text_.emplace_back(kTessTruthText[i]);
76  image_files.emplace_back(kTessImages[i]);
77  ++i;
78  }
79  LOG(INFO) << "Testing Tesseract on " << i << " languages.";
80  }
81  if (FLAGS_test_cube) {
82  int i = 0;
83  while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) {
84  langs_.emplace_back(kCubeLangs[i]);
85  gt_text_.emplace_back(kCubeTruthText[i]);
86  image_files.emplace_back(kCubeImages[i]);
87  ++i;
88  }
89  LOG(INFO) << "Testing Cube on " << i << " languages.";
90  }
91  num_langs_ = langs_.size();
92 
93  // Pre-load the images into an array. We will be making multiple copies of
94  // an image here if FLAGS_reps > 1 and that is intentional. In this test, we
95  // wish to not make any assumptions about the thread-safety of Pix objects,
96  // and so entirely disallow concurrent access of a Pix instance.
97  const int n = num_langs_ * FLAGS_reps;
98  for (int i = 0; i < n; ++i) {
99  std::string path = TESTING_DIR "/" + image_files[i % num_langs_];
100  Image new_pix = pixRead(path.c_str());
101  QCHECK(new_pix != nullptr) << "Could not read " << path;
102  pix_.push_back(new_pix);
103  }
104 
105 #ifdef INCLUDE_TENSORFLOW
106  pool_size_ = (FLAGS_max_concurrent_instances < 1) ? num_langs_ * FLAGS_reps
107  : FLAGS_max_concurrent_instances;
108 #endif
109  }
110 
111  static void TearDownTestCase() {
112  for (auto &pix : pix_) {
113  pix.destroy();
114  }
115  }
116 
117 #ifdef INCLUDE_TENSORFLOW
118  void ResetPool() {
119  pool_.reset(
120  new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
121  }
122 
123  void WaitForPoolWorkers() {
124  pool_.reset(nullptr);
125  }
126 
127  std::unique_ptr<tensorflow::thread::ThreadPool> pool_;
128  static int pool_size_;
129 #endif
130  static std::vector<Image > pix_;
131  static std::vector<std::string> langs_;
132  static std::vector<std::string> gt_text_;
133  static int num_langs_;
134 };
135 
136 // static member variable declarations.
137 #ifdef INCLUDE_TENSORFLOW
138 int BaseapiThreadTest::pool_size_;
139 #endif
140 std::vector<Image > BaseapiThreadTest::pix_;
141 std::vector<std::string> BaseapiThreadTest::langs_;
142 std::vector<std::string> BaseapiThreadTest::gt_text_;
144 
145 static void InitTessInstance(TessBaseAPI *tess, const std::string &lang) {
146  CHECK(tess != nullptr);
147  EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str()));
148 }
149 
150 static void GetCleanedText(TessBaseAPI *tess, Image pix, std::string &ocr_text) {
151  tess->SetImage(pix);
152  char *result = tess->GetUTF8Text();
153  ocr_text = result;
154  delete[] result;
155  trim(ocr_text);
156 }
157 
#ifdef INCLUDE_TENSORFLOW
// Recognizes |pix| and expects the cleaned OCR text to equal |expected_text|.
//
// If |tess| is non-null it is used as-is and must already be initialized.
// If |tess| is null, a temporary instance is created and initialized for
// |lang| inside this call and destroyed before returning, which lets a
// single task exercise both initialization and recognition.
static void VerifyTextResult(TessBaseAPI *tess, Image pix, const std::string &lang,
                             const std::string &expected_text) {
  // Owns the temporary instance, if one is needed, so that it is released on
  // every path out of this function.
  std::unique_ptr<TessBaseAPI> owned_tess;
  if (tess == nullptr) {
    owned_tess.reset(new TessBaseAPI);
    InitTessInstance(owned_tess.get(), lang);
    tess = owned_tess.get();
  }
  std::string ocr_text;
  GetCleanedText(tess, pix, ocr_text);
  EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str()) << "Failed with lang = " << lang;
}
#endif
176 
177 // Check that Tesseract/Cube produce the correct results in single-threaded
178 // operation. If not, it is pointless to run the real multi-threaded tests.
179 TEST_F(BaseapiThreadTest, TestBasicSanity) {
180  for (int i = 0; i < num_langs_; ++i) {
181  TessBaseAPI tess;
182  InitTessInstance(&tess, langs_[i]);
183  std::string ocr_text;
184  GetCleanedText(&tess, pix_[i], ocr_text);
185  CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) << "Failed with lang = " << langs_[i];
186  }
187 }
188 
189 // Test concurrent instance initialization.
191 #ifdef INCLUDE_TENSORFLOW
192  const int n = num_langs_ * FLAGS_reps;
193  ResetPool();
194  std::vector<TessBaseAPI> tess(n);
195  for (int i = 0; i < n; ++i) {
196  pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_]));
197  }
198  WaitForPoolWorkers();
199 #endif
200 }
201 
202 // Test concurrent recognition.
203 TEST_F(BaseapiThreadTest, TestRecognition) {
204 #ifdef INCLUDE_TENSORFLOW
205  const int n = num_langs_ * FLAGS_reps;
206  std::vector<TessBaseAPI> tess(n);
207  // Initialize api instances in a single thread.
208  for (int i = 0; i < n; ++i) {
209  InitTessInstance(&tess[i], langs_[i % num_langs_]);
210  }
211 
212  ResetPool();
213  for (int i = 0; i < n; ++i) {
214  pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], langs_[i % num_langs_],
215  gt_text_[i % num_langs_]));
216  }
217  WaitForPoolWorkers();
218 #endif
219 }
220 
222 #ifdef INCLUDE_TENSORFLOW
223  const int n = num_langs_ * FLAGS_reps;
224  ResetPool();
225  for (int i = 0; i < n; ++i) {
226  pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], langs_[i % num_langs_],
227  gt_text_[i % num_langs_]));
228  }
229  WaitForPoolWorkers();
230 #endif
231 }
232 } // namespace tesseract
struct TessBaseAPI TessBaseAPI
Definition: capi.h:62
INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.")
BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances")
@ LOG
#define CHECK(condition)
Definition: include_gunit.h:76
@ INFO
Definition: log.h:28
TEST_F(EuroText, FastLatinOCR)
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:365
static std::vector< std::string > langs_
static std::vector< Image > pix_
static std::vector< std::string > gt_text_