"Fossies" - the Fresh Open Source Software Archive

Member "tesseract-5.2.0/src/ccutil/tessdatamanager.cpp" (6 Jul 2022, 10555 Bytes) of package /linux/misc/tesseract-5.2.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "tessdatamanager.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.1.0_vs_5.2.0.

    1 ///////////////////////////////////////////////////////////////////////
    2 // File:        tessdatamanager.cpp
    3 // Description: Functions to handle loading/combining tesseract data files.
    4 // Author:      Daria Antonova
    5 //
    6 // (C) Copyright 2009, Google Inc.
    7 // Licensed under the Apache License, Version 2.0 (the "License");
    8 // you may not use this file except in compliance with the License.
    9 // You may obtain a copy of the License at
   10 // http://www.apache.org/licenses/LICENSE-2.0
   11 // Unless required by applicable law or agreed to in writing, software
   12 // distributed under the License is distributed on an "AS IS" BASIS,
   13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14 // See the License for the specific language governing permissions and
   15 // limitations under the License.
   16 //
   17 ///////////////////////////////////////////////////////////////////////
   18 
   19 #ifdef HAVE_CONFIG_H
   20 #  include "config_auto.h"
   21 #endif
   22 
   23 #include "tessdatamanager.h"
   24 
   25 #include <cstdio>
   26 #include <string>
   27 
   28 #if defined(HAVE_LIBARCHIVE)
   29 #  include <archive.h>
   30 #  include <archive_entry.h>
   31 #endif
   32 
   33 #include <tesseract/version.h>
   34 #include "errcode.h"
   35 #include "helpers.h"
   36 #include "params.h"
   37 #include "serialis.h"
   38 #include "tprintf.h"
   39 
   40 namespace tesseract {
   41 
   42 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
   43   SetVersionString(TESSERACT_VERSION_STR);
   44 }
   45 
   46 TessdataManager::TessdataManager(FileReader reader)
   47     : reader_(reader), is_loaded_(false), swap_(false) {
   48   SetVersionString(TESSERACT_VERSION_STR);
   49 }
   50 
   51 // Lazily loads from the given filename. Won't actually read the file
   52 // until it needs it.
   53 void TessdataManager::LoadFileLater(const char *data_file_name) {
   54   Clear();
   55   data_file_name_ = data_file_name;
   56 }
   57 
   58 #if defined(HAVE_LIBARCHIVE)
   59 bool TessdataManager::LoadArchiveFile(const char *filename) {
   60   bool result = false;
   61   archive *a = archive_read_new();
   62   if (a != nullptr) {
   63     archive_read_support_filter_all(a);
   64     archive_read_support_format_all(a);
   65     if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
   66       archive_entry *ae;
   67       while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
   68         const char *component = archive_entry_pathname(ae);
   69         if (component != nullptr) {
   70           TessdataType type;
   71           if (TessdataTypeFromFileName(component, &type)) {
   72             int64_t size = archive_entry_size(ae);
   73             if (size > 0) {
   74               entries_[type].resize(size);
   75               if (archive_read_data(a, &entries_[type][0], size) == size) {
   76                 is_loaded_ = true;
   77               }
   78             }
   79           }
   80         }
   81       }
   82       result = is_loaded_;
   83     }
   84     archive_read_free(a);
   85   }
   86   return result;
   87 }
   88 #endif
   89 
   90 bool TessdataManager::Init(const char *data_file_name) {
   91   std::vector<char> data;
   92   if (reader_ == nullptr) {
   93 #if defined(HAVE_LIBARCHIVE)
   94     if (LoadArchiveFile(data_file_name)) {
   95       return true;
   96     }
   97 #endif
   98     if (!LoadDataFromFile(data_file_name, &data)) {
   99       return false;
  100     }
  101   } else {
  102     if (!(*reader_)(data_file_name, &data)) {
  103       return false;
  104     }
  105   }
  106   return LoadMemBuffer(data_file_name, &data[0], data.size());
  107 }
  108 
  109 // Loads from the given memory buffer as if a file.
  110 bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) {
  111   // TODO: This method supports only the proprietary file format.
  112   Clear();
  113   data_file_name_ = name;
  114   TFile fp;
  115   fp.Open(data, size);
  116   uint32_t num_entries;
  117   if (!fp.DeSerialize(&num_entries)) {
  118     return false;
  119   }
  120   swap_ = num_entries > kMaxNumTessdataEntries;
  121   fp.set_swap(swap_);
  122   if (swap_) {
  123     ReverseN(&num_entries, sizeof(num_entries));
  124   }
  125   if (num_entries > kMaxNumTessdataEntries) {
  126     return false;
  127   }
  128   // TODO: optimize (no init required).
  129   std::vector<int64_t> offset_table(num_entries);
  130   if (!fp.DeSerialize(&offset_table[0], num_entries)) {
  131     return false;
  132   }
  133   for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
  134     if (offset_table[i] >= 0) {
  135       int64_t entry_size = size - offset_table[i];
  136       unsigned j = i + 1;
  137       while (j < num_entries && offset_table[j] == -1) {
  138         ++j;
  139       }
  140       if (j < num_entries) {
  141         entry_size = offset_table[j] - offset_table[i];
  142       }
  143       entries_[i].resize(entry_size);
  144       if (!fp.DeSerialize(&entries_[i][0], entry_size)) {
  145         return false;
  146       }
  147     }
  148   }
  149   if (entries_[TESSDATA_VERSION].empty()) {
  150     SetVersionString("Pre-4.0.0");
  151   }
  152   is_loaded_ = true;
  153   return true;
  154 }
  155 
  156 // Overwrites a single entry of the given type.
  157 void TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) {
  158   is_loaded_ = true;
  159   entries_[type].resize(size);
  160   memcpy(&entries_[type][0], data, size);
  161 }
  162 
  163 // Saves to the given filename.
  164 bool TessdataManager::SaveFile(const char *filename, FileWriter writer) const {
  165   // TODO: This method supports only the proprietary file format.
  166   ASSERT_HOST(is_loaded_);
  167   std::vector<char> data;
  168   Serialize(&data);
  169   if (writer == nullptr) {
  170     return SaveDataToFile(data, filename);
  171   } else {
  172     return (*writer)(data, filename);
  173   }
  174 }
  175 
  176 // Serializes to the given vector.
  177 void TessdataManager::Serialize(std::vector<char> *data) const {
  178   // TODO: This method supports only the proprietary file format.
  179   ASSERT_HOST(is_loaded_);
  180   // Compute the offset_table and total size.
  181   int64_t offset_table[TESSDATA_NUM_ENTRIES];
  182   int64_t offset = sizeof(int32_t) + sizeof(offset_table);
  183   for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
  184     if (entries_[i].empty()) {
  185       offset_table[i] = -1;
  186     } else {
  187       offset_table[i] = offset;
  188       offset += entries_[i].size();
  189     }
  190   }
  191   data->resize(offset, 0);
  192   int32_t num_entries = TESSDATA_NUM_ENTRIES;
  193   TFile fp;
  194   fp.OpenWrite(data);
  195   fp.Serialize(&num_entries);
  196   fp.Serialize(&offset_table[0], countof(offset_table));
  197   for (const auto &entry : entries_) {
  198     if (!entry.empty()) {
  199       fp.Serialize(&entry[0], entry.size());
  200     }
  201   }
  202 }
  203 
  204 // Resets to the initial state, keeping the reader.
  205 void TessdataManager::Clear() {
  206   for (auto &entry : entries_) {
  207     entry.clear();
  208   }
  209   is_loaded_ = false;
  210 }
  211 
  212 // Prints a directory of contents.
  213 void TessdataManager::Directory() const {
  214   tprintf("Version:%s\n", VersionString().c_str());
  215   auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
  216   for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
  217     if (!entries_[i].empty()) {
  218       tprintf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
  219               offset);
  220       offset += entries_[i].size();
  221     }
  222   }
  223 }
  224 
  225 // Opens the given TFile pointer to the given component type.
  226 // Returns false in case of failure.
  227 bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
  228   if (!is_loaded_ && !Init(data_file_name_.c_str())) {
  229     return false;
  230   }
  231   const TessdataManager *const_this = this;
  232   return const_this->GetComponent(type, fp);
  233 }
  234 
  235 // As non-const version except it can't load the component if not already
  236 // loaded.
  237 bool TessdataManager::GetComponent(TessdataType type, TFile *fp) const {
  238   ASSERT_HOST(is_loaded_);
  239   if (entries_[type].empty()) {
  240     return false;
  241   }
  242   fp->Open(&entries_[type][0], entries_[type].size());
  243   fp->set_swap(swap_);
  244   return true;
  245 }
  246 
  247 // Returns the current version string.
  248 std::string TessdataManager::VersionString() const {
  249   return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());
  250 }
  251 
  252 // Sets the version string to the given v_str.
  253 void TessdataManager::SetVersionString(const std::string &v_str) {
  254   entries_[TESSDATA_VERSION].resize(v_str.size());
  255   memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
  256 }
  257 
  258 bool TessdataManager::CombineDataFiles(const char *language_data_path_prefix,
  259                                        const char *output_filename) {
  260   // Load individual tessdata components from files.
  261   for (auto filesuffix : kTessdataFileSuffixes) {
  262     TessdataType type;
  263     ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
  264     std::string filename = language_data_path_prefix;
  265     filename += filesuffix;
  266     FILE *fp = fopen(filename.c_str(), "rb");
  267     if (fp != nullptr) {
  268       fclose(fp);
  269       if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
  270         tprintf("Load of file %s failed!\n", filename.c_str());
  271         return false;
  272       }
  273     }
  274   }
  275   is_loaded_ = true;
  276 
  277   // Make sure that the required components are present.
  278   if (!IsBaseAvailable() && !IsLSTMAvailable()) {
  279     tprintf(
  280         "Error: traineddata file must contain at least (a unicharset file"
  281         "and inttemp) OR an lstm file.\n");
  282     return false;
  283   }
  284   // Write updated data to the output traineddata file.
  285   return SaveFile(output_filename, nullptr);
  286 }
  287 
  288 bool TessdataManager::OverwriteComponents(const char *new_traineddata_filename,
  289                                           char **component_filenames, int num_new_components) {
  290   // Open the files with the new components.
  291   // TODO: This method supports only the proprietary file format.
  292   for (int i = 0; i < num_new_components; ++i) {
  293     TessdataType type;
  294     if (TessdataTypeFromFileName(component_filenames[i], &type)) {
  295       if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
  296         tprintf("Failed to read component file:%s\n", component_filenames[i]);
  297         return false;
  298       }
  299     }
  300   }
  301 
  302   // Write updated data to the output traineddata file.
  303   return SaveFile(new_traineddata_filename, nullptr);
  304 }
  305 
  306 bool TessdataManager::ExtractToFile(const char *filename) {
  307   TessdataType type = TESSDATA_NUM_ENTRIES;
  308   ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
  309   if (entries_[type].empty()) {
  310     return false;
  311   }
  312   return SaveDataToFile(entries_[type], filename);
  313 }
  314 
  315 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) {
  316   for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
  317     if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
  318       *type = static_cast<TessdataType>(i);
  319       return true;
  320     }
  321   }
  322 #if !defined(NDEBUG)
  323   tprintf(
  324       "TessdataManager can't determine which tessdata"
  325       " component is represented by %s\n",
  326       suffix);
  327 #endif
  328   return false;
  329 }
  330 
  331 bool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) {
  332   // Get the file suffix (extension)
  333   const char *suffix = strrchr(filename, '.');
  334   if (suffix == nullptr || *(++suffix) == '\0') {
  335     return false;
  336   }
  337   return TessdataTypeFromFileSuffix(suffix, type);
  338 }
  339 
  340 } // namespace tesseract