tesseract  5.0.0
imagedata.cpp
Go to the documentation of this file.
1 // File: imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 // training file and its corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "imagedata.h"
25 
26 #include "boxread.h" // for ReadMemBoxes
27 #include "rect.h" // for TBOX
28 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
29 #include "tprintf.h" // for tprintf
30 
31 #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo
32 #include "serialis.h" // for TFile
33 
34 #include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
35 
36 #include <cinttypes> // for PRId64
37 
38 namespace tesseract {
39 
// Upper bound on how many documents are read ahead while training. A small
// value is sufficient.
constexpr int kMaxReadAhead = 8;
43 
44 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {}
45 // Takes ownership of the pix and destroys it.
46 ImageData::ImageData(bool vertical, Image pix)
47  : page_number_(0), vertical_text_(vertical) {
48  SetPix(pix);
49 }
51 #ifdef TESSERACT_IMAGEDATA_AS_PIX
52  internal_pix_.destroy();
53 #endif
54 }
55 
56 // Builds and returns an ImageData from the basic data. Note that imagedata,
57 // truth_text, and box_text are all the actual file data, NOT filenames.
58 ImageData *ImageData::Build(const char *name, int page_number, const char *lang,
59  const char *imagedata, int imagedatasize,
60  const char *truth_text, const char *box_text) {
61  auto *image_data = new ImageData();
62  image_data->imagefilename_ = name;
63  image_data->page_number_ = page_number;
64  image_data->language_ = lang;
65  // Save the imagedata.
66  // TODO: optimize resize (no init).
67  image_data->image_data_.resize(imagedatasize);
68  memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
69  if (!image_data->AddBoxes(box_text)) {
70  if (truth_text == nullptr || truth_text[0] == '\0') {
71  tprintf("Error: No text corresponding to page %d from image %s!\n",
72  page_number, name);
73  delete image_data;
74  return nullptr;
75  }
76  image_data->transcription_ = truth_text;
77  // If we have no boxes, the transcription is in the 0th box_texts_.
78  image_data->box_texts_.emplace_back(truth_text);
79  // We will create a box for the whole image on PreScale, to save unpacking
80  // the image now.
81  } else if (truth_text != nullptr && truth_text[0] != '\0' &&
82  image_data->transcription_ != truth_text) {
83  // Save the truth text as it is present and disagrees with the box text.
84  image_data->transcription_ = truth_text;
85  }
86  return image_data;
87 }
88 
89 // Writes to the given file. Returns false in case of error.
90 bool ImageData::Serialize(TFile *fp) const {
91  if (!fp->Serialize(imagefilename_)) {
92  return false;
93  }
94  if (!fp->Serialize(&page_number_)) {
95  return false;
96  }
97  if (!fp->Serialize(image_data_)) {
98  return false;
99  }
100  if (!fp->Serialize(language_)) {
101  return false;
102  }
103  if (!fp->Serialize(transcription_)) {
104  return false;
105  }
106  if (!fp->Serialize(boxes_)) {
107  return false;
108  }
109  if (!fp->Serialize(box_texts_)) {
110  return false;
111  }
112  int8_t vertical = vertical_text_;
113  return fp->Serialize(&vertical);
114 }
115 
116 // Reads from the given file. Returns false in case of error.
118  if (!fp->DeSerialize(imagefilename_)) {
119  return false;
120  }
121  if (!fp->DeSerialize(&page_number_)) {
122  return false;
123  }
124  if (!fp->DeSerialize(image_data_)) {
125  return false;
126  }
127  if (!fp->DeSerialize(language_)) {
128  return false;
129  }
130  if (!fp->DeSerialize(transcription_)) {
131  return false;
132  }
133  if (!fp->DeSerialize(boxes_)) {
134  return false;
135  }
136  if (!fp->DeSerialize(box_texts_)) {
137  return false;
138  }
139  int8_t vertical = 0;
140  if (!fp->DeSerialize(&vertical)) {
141  return false;
142  }
143  vertical_text_ = vertical != 0;
144  return true;
145 }
146 
147 // As DeSerialize, but only seeks past the data - hence a static method.
149  if (!fp->DeSerializeSkip()) {
150  return false;
151  }
152  int32_t page_number;
153  if (!fp->DeSerialize(&page_number)) {
154  return false;
155  }
156  if (!fp->DeSerializeSkip()) {
157  return false;
158  }
159  if (!fp->DeSerializeSkip()) {
160  return false;
161  }
162  if (!fp->DeSerializeSkip()) {
163  return false;
164  }
165  if (!fp->DeSerializeSkip(sizeof(TBOX))) {
166  return false;
167  }
168  int32_t number;
169  if (!fp->DeSerialize(&number)) {
170  return false;
171  }
172  for (int i = 0; i < number; i++) {
173  if (!fp->DeSerializeSkip()) {
174  return false;
175  }
176  }
177  int8_t vertical = 0;
178  return fp->DeSerialize(&vertical);
179 }
180 
181 // Saves the given Pix as a PNG-encoded string and destroys it.
182 // In case of missing PNG support in Leptonica use PNM format,
183 // which requires more memory.
185 #ifdef TESSERACT_IMAGEDATA_AS_PIX
186  internal_pix_ = pix;
187 #else
188  SetPixInternal(pix, &image_data_);
189 #endif
190 }
191 
192 // Returns the Pix image for *this. Must be pixDestroyed after use.
194 #ifdef TESSERACT_IMAGEDATA_AS_PIX
195 # ifdef GRAPHICS_DISABLED
196  /* The only caller of this is the scaling functions to prescale the
197  * source. Thus we can just return a new pointer to the same data. */
198  return internal_pix_.clone();
199 # else
200  /* pixCopy always does an actual copy, so the caller can modify the
201  * changed data. */
202  return internal_pix_.copy();
203 # endif
204 #else
205  return GetPixInternal(image_data_);
206 #endif
207 }
208 
209 // Gets anything and everything with a non-nullptr pointer, prescaled to a
210 // given target_height (if 0, then the original image height), and aligned.
211 // Also returns (if not nullptr) the width and height of the scaled image.
212 // The return value is the scaled Pix, which must be pixDestroyed after use,
213 // and scale_factor (if not nullptr) is set to the scale factor that was applied
214 // to the image to achieve the target_height.
215 Image ImageData::PreScale(int target_height, int max_height,
216  float *scale_factor, int *scaled_width,
217  int *scaled_height, std::vector<TBOX> *boxes) const {
218  int input_width = 0;
219  int input_height = 0;
220  Image src_pix = GetPix();
221  ASSERT_HOST(src_pix != nullptr);
222  input_width = pixGetWidth(src_pix);
223  input_height = pixGetHeight(src_pix);
224  if (target_height == 0) {
225  target_height = std::min(input_height, max_height);
226  }
227  float im_factor = static_cast<float>(target_height) / input_height;
228  if (scaled_width != nullptr) {
229  *scaled_width = IntCastRounded(im_factor * input_width);
230  }
231  if (scaled_height != nullptr) {
232  *scaled_height = target_height;
233  }
234  // Get the scaled image.
235  Image pix = pixScale(src_pix, im_factor, im_factor);
236  if (pix == nullptr) {
237  tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
238  input_width, input_height, im_factor);
239  src_pix.destroy();
240  return nullptr;
241  }
242  if (scaled_width != nullptr) {
243  *scaled_width = pixGetWidth(pix);
244  }
245  if (scaled_height != nullptr) {
246  *scaled_height = pixGetHeight(pix);
247  }
248  src_pix.destroy();
249  if (boxes != nullptr) {
250  // Get the boxes.
251  boxes->clear();
252  for (auto box : boxes_) {
253  box.scale(im_factor);
254  boxes->push_back(box);
255  }
256  if (boxes->empty()) {
257  // Make a single box for the whole image.
258  TBOX box(0, 0, im_factor * input_width, target_height);
259  boxes->push_back(box);
260  }
261  }
262  if (scale_factor != nullptr) {
263  *scale_factor = im_factor;
264  }
265  return pix;
266 }
267 
269  return image_data_.size();
270 }
271 
272 #ifndef GRAPHICS_DISABLED
273 
274 // Draws the data in a new window.
275 void ImageData::Display() const {
276  const int kTextSize = 64;
277  // Draw the image.
278  Image pix = GetPix();
279  if (pix == nullptr) {
280  return;
281  }
282  int width = pixGetWidth(pix);
283  int height = pixGetHeight(pix);
284  auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize),
285  2 * (height + 4 * kTextSize), width + 10,
286  height + 3 * kTextSize, true);
287  win->Draw(pix, 0, height - 1);
288  pix.destroy();
289  // Draw the boxes.
290  win->Pen(ScrollView::RED);
291  win->Brush(ScrollView::NONE);
292  int text_size = kTextSize;
293  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {
294  text_size = boxes_[0].height() * 2;
295  }
296  win->TextAttributes("Arial", text_size, false, false, false);
297  if (!boxes_.empty()) {
298  for (unsigned b = 0; b < boxes_.size(); ++b) {
299  boxes_[b].plot(win);
300  win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
301  }
302  } else {
303  // The full transcription.
304  win->Pen(ScrollView::CYAN);
305  win->Text(0, height + kTextSize * 2, transcription_.c_str());
306  }
307  win->Update();
308  win->Wait();
309 }
310 
311 #endif
312 
313 // Adds the supplied boxes and transcriptions that correspond to the correct
314 // page number.
315 void ImageData::AddBoxes(const std::vector<TBOX> &boxes,
316  const std::vector<std::string> &texts,
317  const std::vector<int> &box_pages) {
318  // Copy the boxes and make the transcription.
319  for (unsigned i = 0; i < box_pages.size(); ++i) {
320  if (page_number_ >= 0 && box_pages[i] != page_number_) {
321  continue;
322  }
323  transcription_ += texts[i];
324  boxes_.push_back(boxes[i]);
325  box_texts_.push_back(texts[i]);
326  }
327 }
328 
329 #ifndef TESSERACT_IMAGEDATA_AS_PIX
330 // Saves the given Pix as a PNG-encoded string and destroys it.
331 // In case of missing PNG support in Leptonica use PNM format,
332 // which requires more memory.
333 void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) {
334  l_uint8 *data;
335  size_t size;
336  l_int32 ret;
337  ret = pixWriteMem(&data, &size, pix, IFF_PNG);
338  if (ret) {
339  ret = pixWriteMem(&data, &size, pix, IFF_PNM);
340  }
341  pix.destroy();
342  // TODO: optimize resize (no init).
343  image_data->resize(size);
344  memcpy(&(*image_data)[0], data, size);
345  lept_free(data);
346 }
347 
348 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
349 Image ImageData::GetPixInternal(const std::vector<char> &image_data) {
350  Image pix = nullptr;
351  if (!image_data.empty()) {
352  // Convert the array to an image.
353  const auto *u_data =
354  reinterpret_cast<const unsigned char *>(&image_data[0]);
355  pix = pixReadMem(u_data, image_data.size());
356  }
357  return pix;
358 }
359 #endif
360 
361 // Parses the text string as a box file and adds any discovered boxes that
362 // match the page number. Returns false on error.
363 bool ImageData::AddBoxes(const char *box_text) {
364  if (box_text != nullptr && box_text[0] != '\0') {
365  std::vector<TBOX> boxes;
366  std::vector<std::string> texts;
367  std::vector<int> box_pages;
368  if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
369  /*continue_on_failure*/ true, &boxes, &texts, nullptr,
370  &box_pages)) {
371  AddBoxes(boxes, texts, box_pages);
372  return true;
373  } else {
374  tprintf("Error: No boxes for page %d from image %s!\n", page_number_,
375  imagefilename_.c_str());
376  }
377  }
378  return false;
379 }
380 
381 DocumentData::DocumentData(const std::string &name)
382  : document_name_(name),
383  pages_offset_(-1),
384  total_pages_(-1),
385  memory_used_(0),
386  max_memory_(0),
387  reader_(nullptr) {}
388 
390  if (thread.joinable()) {
391  thread.join();
392  }
393  std::lock_guard<std::mutex> lock_p(pages_mutex_);
394  std::lock_guard<std::mutex> lock_g(general_mutex_);
395  for (auto data : pages_) {
396  delete data;
397  }
398 }
399 
400 // Reads all the pages in the given lstmf filename to the cache. The reader
401 // is used to read the file.
402 bool DocumentData::LoadDocument(const char *filename, int start_page,
403  int64_t max_memory, FileReader reader) {
404  SetDocument(filename, max_memory, reader);
405  pages_offset_ = start_page;
406  return ReCachePages();
407 }
408 
409 // Sets up the document, without actually loading it.
410 void DocumentData::SetDocument(const char *filename, int64_t max_memory,
411  FileReader reader) {
412  std::lock_guard<std::mutex> lock_p(pages_mutex_);
413  std::lock_guard<std::mutex> lock(general_mutex_);
414  document_name_ = filename;
415  pages_offset_ = -1;
416  max_memory_ = max_memory;
417  reader_ = reader;
418 }
419 
420 // Writes all the pages to the given filename. Returns false on error.
421 bool DocumentData::SaveDocument(const char *filename, FileWriter writer) {
422  std::lock_guard<std::mutex> lock(pages_mutex_);
423  TFile fp;
424  fp.OpenWrite(nullptr);
425  if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {
426  tprintf("Serialize failed: %s\n", filename);
427  return false;
428  }
429  return true;
430 }
431 
432 // Adds the given page data to this document, counting up memory.
434  std::lock_guard<std::mutex> lock(pages_mutex_);
435  pages_.push_back(page);
436  set_memory_used(memory_used() + page->MemoryUsed());
437 }
438 
439 // If the given index is not currently loaded, loads it using a separate
440 // thread.
442  ImageData *page = nullptr;
443  if (IsPageAvailable(index, &page)) {
444  return;
445  }
446  {
447  std::lock_guard<std::mutex> lock(pages_mutex_);
448  if (pages_offset_ == index) {
449  return;
450  }
451  pages_offset_ = index;
452  for (auto page : pages_) {
453  delete page;
454  }
455  pages_.clear();
456  }
457  if (thread.joinable()) {
458  thread.join();
459  }
460  // Don't run next statement asynchronously because that would
461  // create too many threads on Linux (see issue #3111).
462  ReCachePages();
463 }
464 
465 // Returns a pointer to the page with the given index, modulo the total
466 // number of pages. Blocks until the background load is completed.
467 const ImageData *DocumentData::GetPage(int index) {
468  ImageData *page = nullptr;
469  while (!IsPageAvailable(index, &page)) {
470  // If there is no background load scheduled, schedule one now.
471  pages_mutex_.lock();
472  bool needs_loading = pages_offset_ != index;
473  pages_mutex_.unlock();
474  if (needs_loading) {
475  LoadPageInBackground(index);
476  }
477  // We can't directly load the page, or the background load will delete it
478  // while the caller is using it, so give it a chance to work.
479  std::this_thread::yield();
480  }
481  return page;
482 }
483 
484 // Returns true if the requested page is available, and provides a pointer,
485 // which may be nullptr if the document is empty. May block, even though it
486 // doesn't guarantee to return true.
487 bool DocumentData::IsPageAvailable(int index, ImageData **page) {
488  std::lock_guard<std::mutex> lock(pages_mutex_);
489  int num_pages = NumPages();
490  if (num_pages == 0 || index < 0) {
491  *page = nullptr; // Empty Document.
492  return true;
493  }
494  if (num_pages > 0) {
495  index = Modulo(index, num_pages);
496  if (pages_offset_ <= index &&
497  static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
498  *page = pages_[index - pages_offset_]; // Page is available already.
499  return true;
500  }
501  }
502  return false;
503 }
504 
505 // Removes all pages from memory and frees the memory, but does not forget
506 // the document metadata.
508  std::lock_guard<std::mutex> lock(pages_mutex_);
509  int64_t memory_saved = memory_used();
510  for (auto page : pages_) {
511  delete page;
512  }
513  pages_.clear();
514  pages_offset_ = -1;
515  set_total_pages(-1);
516  set_memory_used(0);
517  tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
518  document_name_.c_str(), memory_saved);
519  return memory_saved;
520 }
521 
522 // Shuffles all the pages in the document.
524  TRand random;
525  // Different documents get shuffled differently, but the same for the same
526  // name.
527  random.set_seed(document_name_.c_str());
528  int num_pages = pages_.size();
529  // Execute one random swap for each page in the document.
530  for (int i = 0; i < num_pages; ++i) {
531  int src = random.IntRand() % num_pages;
532  int dest = random.IntRand() % num_pages;
533  std::swap(pages_[src], pages_[dest]);
534  }
535 }
536 
537 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
538 // starting at index pages_offset_.
539 bool DocumentData::ReCachePages() {
540  std::lock_guard<std::mutex> lock(pages_mutex_);
541  // Read the file.
542  set_total_pages(0);
543  set_memory_used(0);
544  int loaded_pages = 0;
545  for (auto page : pages_) {
546  delete page;
547  }
548  pages_.clear();
549  TFile fp;
550  if (!fp.Open(document_name_.c_str(), reader_) ||
551  !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {
552  tprintf("Deserialize header failed: %s\n", document_name_.c_str());
553  return false;
554  }
555  pages_offset_ %= loaded_pages;
556  // Skip pages before the first one we want, and load the rest until max
557  // memory and skip the rest after that.
558  int page;
559  for (page = 0; page < loaded_pages; ++page) {
560  uint8_t non_null;
561  if (!fp.DeSerialize(&non_null)) {
562  break;
563  }
564  if (page < pages_offset_ ||
565  (max_memory_ > 0 && memory_used() > max_memory_)) {
566  if (non_null && !ImageData::SkipDeSerialize(&fp)) {
567  break;
568  }
569  } else {
570  ImageData *image_data = nullptr;
571  if (non_null) {
572  image_data = new ImageData;
573  if (!image_data->DeSerialize(&fp)) {
574  delete image_data;
575  break;
576  }
577  }
578  pages_.push_back(image_data);
579  if (image_data->imagefilename().empty()) {
580  image_data->set_imagefilename(document_name_);
581  image_data->set_page_number(page);
582  }
583  set_memory_used(memory_used() + image_data->MemoryUsed());
584  }
585  }
586  if (page < loaded_pages) {
587  tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(),
588  page, loaded_pages);
589  for (auto page : pages_) {
590  delete page;
591  }
592  pages_.clear();
593  } else if (loaded_pages > 1) {
594  // Avoid lots of messages for training with single line images.
595  tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(),
596  loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
597  document_name_.c_str());
598  }
599  set_total_pages(loaded_pages);
600  return !pages_.empty();
601 }
602 
603 // A collection of DocumentData that knows roughly how much memory it is using.
604 DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {}
605 
607  for (auto *document : documents_) {
608  delete document;
609  }
610 }
611 
612 // Adds all the documents in the list of filenames, counting memory.
613 // The reader is used to read the files.
614 bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames,
615  CachingStrategy cache_strategy,
616  FileReader reader) {
617  cache_strategy_ = cache_strategy;
618  int64_t fair_share_memory = 0;
619  // In the round-robin case, each DocumentData handles restricting its content
620  // to its fair share of memory. In the sequential case, DocumentCache
621  // determines which DocumentDatas are held entirely in memory.
622  if (cache_strategy_ == CS_ROUND_ROBIN) {
623  fair_share_memory = max_memory_ / filenames.size();
624  }
625  for (const auto &filename : filenames) {
626  auto *document = new DocumentData(filename);
627  document->SetDocument(filename.c_str(), fair_share_memory, reader);
628  AddToCache(document);
629  }
630  if (!documents_.empty()) {
631  // Try to get the first page now to verify the list of filenames.
632  if (GetPageBySerial(0) != nullptr) {
633  return true;
634  }
635  tprintf("Load of page 0 failed!\n");
636  }
637  return false;
638 }
639 
640 // Adds document to the cache.
642  documents_.push_back(data);
643  return true;
644 }
645 
646 // Finds and returns a document by name.
648  const std::string &document_name) const {
649  for (auto *document : documents_) {
650  if (document->document_name() == document_name) {
651  return document;
652  }
653  }
654  return nullptr;
655 }
656 
657 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
658 // strategy, could take a long time.
660  if (cache_strategy_ == CS_SEQUENTIAL) {
661  // In sequential mode, we assume each doc has the same number of pages
662  // whether it is true or not.
663  if (num_pages_per_doc_ == 0) {
664  GetPageSequential(0);
665  }
666  return num_pages_per_doc_ * documents_.size();
667  }
668  int total_pages = 0;
669  for (auto *document : documents_) {
670  // We have to load a page to make NumPages() valid.
671  document->GetPage(0);
672  total_pages += document->NumPages();
673  }
674  return total_pages;
675 }
676 
677 // Returns a page by serial number, selecting them in a round-robin fashion
678 // from all the documents. Highly disk-intensive, but doesn't need samples
679 // to be shuffled between files to begin with.
680 const ImageData *DocumentCache::GetPageRoundRobin(int serial) {
681  int num_docs = documents_.size();
682  int doc_index = serial % num_docs;
683  const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);
684  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
685  doc_index = (serial + offset) % num_docs;
686  int page = (serial + offset) / num_docs;
687  documents_[doc_index]->LoadPageInBackground(page);
688  }
689  return doc;
690 }
691 
692 // Returns a page by serial number, selecting them in sequence from each file.
693 // Requires the samples to be shuffled between the files to give a random or
694 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
695 const ImageData *DocumentCache::GetPageSequential(int serial) {
696  int num_docs = documents_.size();
697  ASSERT_HOST(num_docs > 0);
698  if (num_pages_per_doc_ == 0) {
699  // Use the pages in the first doc as the number of pages in each doc.
700  documents_[0]->GetPage(0);
701  num_pages_per_doc_ = documents_[0]->NumPages();
702  if (num_pages_per_doc_ == 0) {
703  tprintf("First document cannot be empty!!\n");
704  ASSERT_HOST(num_pages_per_doc_ > 0);
705  }
706  // Get rid of zero now if we don't need it.
707  if (serial / num_pages_per_doc_ % num_docs > 0) {
708  documents_[0]->UnCache();
709  }
710  }
711  int doc_index = serial / num_pages_per_doc_ % num_docs;
712  const ImageData *doc =
713  documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
714  // Count up total memory. Background loading makes it more complicated to
715  // keep a running count.
716  int64_t total_memory = 0;
717  for (auto *document : documents_) {
718  total_memory += document->memory_used();
719  }
720  if (total_memory >= max_memory_) {
721  // Find something to un-cache.
722  // If there are more than 3 in front, then serial is from the back reader
723  // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
724  // we create a hole between them and then un-caching the backmost occupied
725  // will work for both.
726  int num_in_front = CountNeighbourDocs(doc_index, 1);
727  for (int offset = num_in_front - 2;
728  offset > 1 && total_memory >= max_memory_; --offset) {
729  int next_index = (doc_index + offset) % num_docs;
730  total_memory -= documents_[next_index]->UnCache();
731  }
732  // If that didn't work, the best solution is to un-cache from the back. If
733  // we take away the document that a 2nd reader is using, it will put it
734  // back and make a hole between.
735  int num_behind = CountNeighbourDocs(doc_index, -1);
736  for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
737  ++offset) {
738  int next_index = (doc_index + offset + num_docs) % num_docs;
739  total_memory -= documents_[next_index]->UnCache();
740  }
741  }
742  int next_index = (doc_index + 1) % num_docs;
743  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
744  documents_[next_index]->LoadPageInBackground(0);
745  }
746  return doc;
747 }
748 
749 // Helper counts the number of adjacent cached neighbours of index looking in
750 // direction dir, ie index+dir, index+2*dir etc.
751 int DocumentCache::CountNeighbourDocs(int index, int dir) {
752  int num_docs = documents_.size();
753  for (int offset = dir; abs(offset) < num_docs; offset += dir) {
754  int offset_index = (index + offset + num_docs) % num_docs;
755  if (!documents_[offset_index]->IsCached()) {
756  return offset - dir;
757  }
758  }
759  return num_docs;
760 }
761 
762 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
const int kMaxReadAhead
Definition: imagedata.cpp:42
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:90
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:48
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:63
CachingStrategy
Definition: imagedata.h:42
@ CS_SEQUENTIAL
Definition: imagedata.h:49
@ CS_ROUND_ROBIN
Definition: imagedata.h:54
int Modulo(int a, int b)
Definition: helpers.h:158
Image copy() const
Definition: image.cpp:28
Image clone() const
Definition: image.cpp:24
void destroy()
Definition: image.cpp:32
void AddBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, const std::vector< int > &box_pages)
Definition: imagedata.cpp:315
int page_number() const
Definition: imagedata.h:89
static bool SkipDeSerialize(TFile *fp)
Definition: imagedata.cpp:148
void SetPix(Image pix)
Definition: imagedata.cpp:184
Image GetPix() const
Definition: imagedata.cpp:193
void Display() const
Definition: imagedata.cpp:275
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:90
const std::vector< char > & image_data() const
Definition: imagedata.h:95
bool DeSerialize(TFile *fp)
Definition: imagedata.cpp:117
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:58
const std::string & box_text(int index) const
Definition: imagedata.h:113
int MemoryUsed() const
Definition: imagedata.cpp:268
const std::vector< TBOX > & boxes() const
Definition: imagedata.h:107
Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, std::vector< TBOX > *boxes) const
Definition: imagedata.cpp:215
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:487
TESS_API DocumentData(const std::string &name)
Definition: imagedata.cpp:381
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:410
int64_t memory_used() const
Definition: imagedata.h:201
TESS_API bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:421
void LoadPageInBackground(int index)
Definition: imagedata.cpp:441
int NumPages() const
Definition: imagedata.h:194
TESS_API bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:402
TESS_API const ImageData * GetPage(int index)
Definition: imagedata.cpp:467
TESS_API void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:433
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:641
DocumentData * FindDocument(const std::string &document_name) const
Definition: imagedata.cpp:647
TESS_API bool LoadDocuments(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:614
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:317
TESS_API int TotalPages()
Definition: imagedata.cpp:659
TESS_API DocumentCache(int64_t max_memory)
Definition: imagedata.cpp:604
int32_t IntRand()
Definition: helpers.h:72
void set_seed(uint64_t seed)
Definition: helpers.h:62
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
bool DeSerializeSkip(size_t size=1)
Definition: serialis.cpp:86
bool CloseWrite(const char *filename, FileWriter writer)
Definition: serialis.cpp:263