tesseract  5.0.0
imagedata.h
Go to the documentation of this file.
1 // File: imagedata.h
3 // Description: Class to hold information about a single image and its
4 // corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
20 #define TESSERACT_IMAGE_IMAGEDATA_H_
21 
22 #include "image.h"
23 #include "points.h" // for FCOORD
24 
25 #include <mutex> // for std::mutex
26 #include <thread> // for std::thread
27 
28 struct Pix;
29 
30 namespace tesseract {
31 
32 class TFile;
33 class ScrollView;
34 class TBOX;
35 
36 // Amount of padding to apply in output pixels in feature mode.
37 const int kFeaturePadding = 2;
38 // Number of pixels to pad around text boxes.
39 const int kImagePadding = 4;
40 
41 // Enum to determine the caching and data sequencing strategy.
43  // Reads all of one file before moving on to the next. Requires samples to be
44  // shuffled across files. Uses the count of samples in the first file as
45  // the count in all the files to achieve high-speed random access. As a
46  // consequence, if subsequent files are smaller, they get entries used more
47  // than once, and if subsequent files are larger, some entries are not used.
48  // Best for larger data sets that don't fit in memory.
50  // Reads one sample from each file in rotation. Does not require shuffled
51  // samples, but is extremely disk-intensive. Samples in smaller files also
52  // get used more often than samples in larger files.
53  // Best for smaller data sets that mostly fit in memory.
55 };
56 
57 // Class to hold information on a single image:
58 // Filename, cached image as a Pix*, character boxes, text transcription.
59 // The text transcription is the ground truth UTF-8 text for the image.
60 // Character boxes are optional and indicate the desired segmentation of
61 // the text into recognition units.
63 public:
64  ImageData();
65  // Takes ownership of the pix.
66  ImageData(bool vertical, Image pix);
67  ~ImageData();
68 
69  // Builds and returns an ImageData from the basic data. Note that imagedata,
70  // truth_text, and box_text are all the actual file data, NOT filenames.
71  static ImageData *Build(const char *name, int page_number, const char *lang,
72  const char *imagedata, int imagedatasize, const char *truth_text,
73  const char *box_text);
74 
75  // Writes to the given file. Returns false in case of error.
76  bool Serialize(TFile *fp) const;
77  // Reads from the given file. Returns false in case of error.
78  bool DeSerialize(TFile *fp);
79  // As DeSerialize, but only seeks past the data - hence a static method.
80  static bool SkipDeSerialize(TFile *fp);
81 
82  // Other accessors.
83  const std::string &imagefilename() const { // Returns a reference to imagefilename_; no copy is made.
84  return imagefilename_;
85  }
86  void set_imagefilename(const std::string &name) { // Replaces imagefilename_ with a copy of name.
87  imagefilename_ = name;
88  }
89  int page_number() const { // Returns page_number_ by value.
90  return page_number_;
91  }
92  void set_page_number(int num) { // Overwrites page_number_ with num; no range check is applied.
93  page_number_ = num;
94  }
95  const std::vector<char> &image_data() const { // Returns a reference to the image_data_ byte buffer; no copy is made.
96  return image_data_;
97  }
98  const std::string &language() const { // Returns a reference to language_; no copy is made.
99  return language_;
100  }
101  void set_language(const std::string &lang) { // Replaces language_ with a copy of lang.
102  language_ = lang;
103  }
104  const std::string &transcription() const { // Returns a reference to transcription_; no copy is made.
105  return transcription_;
106  }
107  const std::vector<TBOX> &boxes() const { // Returns a reference to boxes_; no copy is made.
108  return boxes_;
109  }
110  const std::vector<std::string> &box_texts() const { // Returns a reference to box_texts_; no copy is made.
111  return box_texts_;
112  }
113  const std::string &box_text(int index) const { // Returns a reference to the single entry box_texts_[index].
114  return box_texts_[index]; // Plain subscript: index is not range-checked here, so it must be a valid position in box_texts_.
115  }
116  // Saves the given Pix as a PNG-encoded string and destroys it.
117  // In case of missing PNG support in Leptonica use PNM format,
118  // which requires more memory.
119  void SetPix(Image pix);
120  // Returns the Pix image for *this. Must be pixDestroyed after use.
121  Image GetPix() const;
122  // Gets anything and everything with a non-nullptr pointer, prescaled to a
123  // given target_height (if 0, then the original image height), and aligned.
124  // Also returns (if not nullptr) the width and height of the scaled image.
125  // The return value is the scaled Pix, which must be pixDestroyed after use,
126  // and scale_factor (if not nullptr) is set to the scale factor that was
127  // applied to the image to achieve the target_height.
128  Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,
129  int *scaled_height, std::vector<TBOX> *boxes) const;
130 
131  int MemoryUsed() const;
132 
133  // Draws the data in a new window.
134  void Display() const;
135 
136  // Adds the supplied boxes and transcriptions that correspond to the correct
137  // page number.
138  void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
139  const std::vector<int> &box_pages);
140 
141 private:
142  // Saves the given Pix as a PNG-encoded string and destroys it.
143  // In case of missing PNG support in Leptonica use PNM format,
144  // which requires more memory.
145  static void SetPixInternal(Image pix, std::vector<char> *image_data);
146  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
147  static Image GetPixInternal(const std::vector<char> &image_data);
148  // Parses the text string as a box file and adds any discovered boxes that
149  // match the page number. Returns false on error.
150  bool AddBoxes(const char *box_text);
151 
152 private:
153  std::string imagefilename_; // File to read image from.
154  int32_t page_number_; // Page number if multi-page tif or -1.
155  // see https://github.com/tesseract-ocr/tesseract/pull/2965
156  // EP: reconsider for tess6.0/opencv
157 #ifdef TESSERACT_IMAGEDATA_AS_PIX
158  Image internal_pix_;
159 #endif
160  std::vector<char> image_data_; // PNG/PNM file data.
161  std::string language_; // Language code for image.
162  std::string transcription_; // UTF-8 ground truth of image.
163  std::vector<TBOX> boxes_; // If non-empty boxes of the image.
164  std::vector<std::string> box_texts_; // String for text in each box.
165  bool vertical_text_; // Image has been rotated from vertical.
166 };
167 
168 // A collection of ImageData that knows roughly how much memory it is using.
170 public:
171  TESS_API
172  explicit DocumentData(const std::string &name);
173  TESS_API
174  ~DocumentData();
175 
176  // Reads all the pages in the given lstmf filename to the cache. The reader
177  // is used to read the file.
178  TESS_API
179  bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);
180  // Sets up the document, without actually loading it.
181  void SetDocument(const char *filename, int64_t max_memory, FileReader reader);
182  // Writes all the pages to the given filename. Returns false on error.
183  TESS_API
184  bool SaveDocument(const char *filename, FileWriter writer);
185 
186  // Adds the given page data to this document, counting up memory.
187  TESS_API
188  void AddPageToDocument(ImageData *page);
189 
190  const std::string &document_name() const { // Returns a reference to document_name_.
191  std::lock_guard<std::mutex> lock(general_mutex_); // Held only until this function returns.
192  return document_name_; // NOTE(review): the caller uses this reference after the lock is released - confirm document_name_ is never modified concurrently.
193  }
194  int NumPages() const { // Returns total_pages_, read while holding general_mutex_.
195  std::lock_guard<std::mutex> lock(general_mutex_);
196  return total_pages_; // Returned by value, so the caller gets a snapshot taken under the lock.
197  }
198  size_t PagesSize() const { // Returns the current number of entries in pages_.
199  return pages_.size(); // NOTE(review): no mutex is taken here, unlike the neighbouring accessors - confirm callers cannot race with a page load.
200  }
201  int64_t memory_used() const { // Returns memory_used_, read while holding general_mutex_.
202  std::lock_guard<std::mutex> lock(general_mutex_);
203  return memory_used_; // Returned by value, so the caller gets a snapshot taken under the lock.
204  }
205  // If the given index is not currently loaded, loads it using a separate
206  // thread. Note: there are 4 cases:
207  // Document uncached: IsCached() returns false, total_pages_ < 0.
208  // Required page is available: IsPageAvailable returns true. In this case,
209  // total_pages_ > 0 and
210  // pages_offset_ <= index%total_pages_ < pages_offset_+pages_.size()
211  // Pages are loaded, but the required one is not.
212  // The requested page is being loaded by LoadPageInBackground. In this case,
213  // index == pages_offset_. Once the loading starts, the pages lock is held
214  // until it completes, at which point IsPageAvailable will unblock and return
215  // true.
216  void LoadPageInBackground(int index);
217  // Returns a pointer to the page with the given index, modulo the total
218  // number of pages. Blocks until the background load is completed.
219  TESS_API
220  const ImageData *GetPage(int index);
221  // Returns true if the requested page is available, and provides a pointer,
222  // which may be nullptr if the document is empty. May block, even though it
223  // doesn't guarantee to return true.
224  bool IsPageAvailable(int index, ImageData **page);
225  // Takes ownership of the given page index: returns the pointer stored at pages_[index] and leaves nullptr in that slot of *this.
226  ImageData *TakePage(int index) {
227  std::lock_guard<std::mutex> lock(pages_mutex_); // Acquires pages_mutex_ for the rest of the function.
228  ImageData *page = pages_[index]; // Direct subscript: index is not range-checked and pages_offset_ is not applied here.
229  pages_[index] = nullptr; // The slot stays in pages_ but no longer refers to the page.
230  return page; // May be nullptr if the slot was already empty.
231  }
232  // Returns true if the document is currently loaded or in the process of
233  // loading. Implemented as a test on the page count reported by NumPages().
234  bool IsCached() const {
235  return NumPages() >= 0; // NumPages() takes general_mutex_ itself; a negative count means not cached.
236  }
237  // Removes all pages from memory and frees the memory, but does not forget
238  // the document metadata. Returns the memory saved.
239  int64_t UnCache();
240  // Shuffles all the pages in the document.
241  void Shuffle();
242 
243 private:
244  // Sets the value of total_pages_ behind a mutex (general_mutex_, the same one NumPages() takes to read it).
245  void set_total_pages(int total) {
246  std::lock_guard<std::mutex> lock(general_mutex_);
247  total_pages_ = total; // Stored as given; no validation of total.
248  }
249  void set_memory_used(int64_t memory_used) { // Sets memory_used_ while holding general_mutex_, the mutex memory_used() reads it under.
250  std::lock_guard<std::mutex> lock(general_mutex_);
251  memory_used_ = memory_used; // The parameter shares its name with the memory_used() accessor; this assigns the member.
252  }
253  // Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
254  // starting at index pages_offset_.
255  bool ReCachePages();
256 
257 private:
258  // A name for this document.
259  std::string document_name_;
260  // A group of pages that corresponds in some loose way to a document.
261  std::vector<ImageData *> pages_;
262  // Page number of the first index in pages_.
263  int pages_offset_;
264  // Total number of pages in document (may exceed size of pages_.)
265  int total_pages_;
266  // Total of all pix sizes in the document.
267  int64_t memory_used_;
268  // Max memory to use at any time.
269  int64_t max_memory_;
270  // Saved reader from LoadDocument to allow re-caching.
271  FileReader reader_;
272  // Mutex that protects pages_ and pages_offset_ against multiple parallel
273  // loads, and provides a wait for page.
274  std::mutex pages_mutex_;
275  // Mutex that protects other data members that callers want to access without
276  // waiting for a load operation.
277  mutable std::mutex general_mutex_;
278 
279  // Thread which loads document.
280  std::thread thread;
281 };
282 
283 // A collection of DocumentData that knows roughly how much memory it is using.
284 // Note that while it supports background read-ahead, it assumes that a single
285 // thread is accessing documents, ie it is not safe for multiple threads to
286 // access different documents in parallel, as one may de-cache the other's
287 // content.
289 public:
290  TESS_API
291  explicit DocumentCache(int64_t max_memory);
292  TESS_API
293  ~DocumentCache();
294 
295  // Deletes all existing documents from the cache, empties documents_ and resets num_pages_per_doc_ to 0.
296  void Clear() {
297  for (auto *document : documents_) {
298  delete document; // Each element is a raw DocumentData pointer that is freed here.
299  }
300  documents_.clear(); // Drops the pointers that were just deleted.
301  num_pages_per_doc_ = 0; // cache_strategy_ and max_memory_ are left as they were.
302  }
303  // Adds all the documents in the list of filenames, counting memory.
304  // The reader is used to read the files.
305  TESS_API
306  bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,
307  FileReader reader);
308 
309  // Adds document to the cache.
310  bool AddToCache(DocumentData *data);
311 
312  // Finds and returns a document by name.
313  DocumentData *FindDocument(const std::string &document_name) const;
314 
315  // Returns a page by serial number using the current cache_strategy_ to
316  // determine the mapping from serial number to page.
317  const ImageData *GetPageBySerial(int serial) {
318  if (cache_strategy_ == CS_SEQUENTIAL) { // Only CS_SEQUENTIAL is tested explicitly.
319  return GetPageSequential(serial);
320  } else { // Every other cache_strategy_ value is served round-robin.
321  return GetPageRoundRobin(serial);
322  }
323  }
324 
325  const std::vector<DocumentData *> &documents() const { // Returns a reference to documents_; the vector is const but its elements point to non-const DocumentData.
326  return documents_;
327  }
328  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
329  // strategy, could take a long time.
330  TESS_API
331  int TotalPages();
332 
333 private:
334  // Returns a page by serial number, selecting them in a round-robin fashion
335  // from all the documents. Highly disk-intensive, but doesn't need samples
336  // to be shuffled between files to begin with.
337  TESS_API
338  const ImageData *GetPageRoundRobin(int serial);
339  // Returns a page by serial number, selecting them in sequence from each file.
340  // Requires the samples to be shuffled between the files to give a random or
341  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
342  TESS_API
343  const ImageData *GetPageSequential(int serial);
344 
345  // Helper counts the number of adjacent cached neighbour documents_ of index
346  // looking in direction dir, ie index+dir, index+2*dir etc.
347  int CountNeighbourDocs(int index, int dir);
348 
349  // The documents held by this cache; the pointers are deleted in Clear().
350  std::vector<DocumentData *> documents_;
351  // Strategy to use for caching and serializing data samples.
352  CachingStrategy cache_strategy_ = CS_SEQUENTIAL;
353  // Number of pages in the first document, used as a divisor in
354  // GetPageSequential to determine the document index.
355  int num_pages_per_doc_ = 0;
356  // Max memory allowed in this cache.
357  int64_t max_memory_ = 0;
358 };
359 
360 } // namespace tesseract
361 
362 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
@ TBOX
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:48
const int kImagePadding
Definition: imagedata.h:39
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
Definition: helpers.h:220
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
const int kFeaturePadding
Definition: imagedata.h:37
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:63
CachingStrategy
Definition: imagedata.h:42
@ CS_SEQUENTIAL
Definition: imagedata.h:49
@ CS_ROUND_ROBIN
Definition: imagedata.h:54
int page_number() const
Definition: imagedata.h:89
void set_imagefilename(const std::string &name)
Definition: imagedata.h:86
void set_page_number(int num)
Definition: imagedata.h:92
const std::string & transcription() const
Definition: imagedata.h:104
const std::string & language() const
Definition: imagedata.h:98
const std::vector< std::string > & box_texts() const
Definition: imagedata.h:110
const std::vector< char > & image_data() const
Definition: imagedata.h:95
const std::string & imagefilename() const
Definition: imagedata.h:83
void set_language(const std::string &lang)
Definition: imagedata.h:101
const std::string & box_text(int index) const
Definition: imagedata.h:113
const std::vector< TBOX > & boxes() const
Definition: imagedata.h:107
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:487
TESS_API DocumentData(const std::string &name)
Definition: imagedata.cpp:381
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:410
int64_t memory_used() const
Definition: imagedata.h:201
TESS_API bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:421
void LoadPageInBackground(int index)
Definition: imagedata.cpp:441
int NumPages() const
Definition: imagedata.h:194
bool IsCached() const
Definition: imagedata.h:234
ImageData * TakePage(int index)
Definition: imagedata.h:226
const std::string & document_name() const
Definition: imagedata.h:190
size_t PagesSize() const
Definition: imagedata.h:198
TESS_API bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:402
TESS_API const ImageData * GetPage(int index)
Definition: imagedata.cpp:467
TESS_API void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:433
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:641
DocumentData * FindDocument(const std::string &document_name) const
Definition: imagedata.cpp:647
const std::vector< DocumentData * > & documents() const
Definition: imagedata.h:325
TESS_API bool LoadDocuments(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:614
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:317
TESS_API int TotalPages()
Definition: imagedata.cpp:659
TESS_API DocumentCache(int64_t max_memory)
Definition: imagedata.cpp:604
#define TESS_API
Definition: export.h:34