"Fossies" - the Fresh Open Source Software Archive

Member "tesseract-5.2.0/src/ccmain/applybox.cpp" (6 Jul 2022, 32831 Bytes) of package /linux/misc/tesseract-5.2.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "applybox.cpp" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 5.0.1_vs_5.1.0.

    1 /**********************************************************************
    2  * File:        applybox.cpp  (Formerly applybox.c)
    3  * Description: Re segment rows according to box file data
    4  * Author:      Phil Cheatle
    5  *
    6  * (C) Copyright 1993, Hewlett-Packard Ltd.
    7  ** Licensed under the Apache License, Version 2.0 (the "License");
    8  ** you may not use this file except in compliance with the License.
    9  ** You may obtain a copy of the License at
   10  ** http://www.apache.org/licenses/LICENSE-2.0
   11  ** Unless required by applicable law or agreed to in writing, software
   12  ** distributed under the License is distributed on an "AS IS" BASIS,
   13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14  ** See the License for the specific language governing permissions and
   15  ** limitations under the License.
   16  *
   17  **********************************************************************/
   18 
   19 #ifndef DISABLED_LEGACY_ENGINE
   20 #  include <allheaders.h>
   21 #  include <cctype>
   22 #  include <cerrno>
   23 #  include <cstring>
   24 #  include "boxread.h"
   25 #endif // ndef DISABLED_LEGACY_ENGINE
   26 #include <tesseract/unichar.h>
   27 #include "pageres.h"
   28 #include "tesseractclass.h"
   29 #include "unicharset.h"
   30 
   31 #ifndef DISABLED_LEGACY_ENGINE
   32 /** Max number of blobs to classify together in FindSegmentation. */
   33 const int kMaxGroupSize = 4;
   34 /// Max fraction of median allowed as deviation in xheight before switching
   35 /// to median.
   36 const double kMaxXHeightDeviationFraction = 0.125;
   37 #endif // ndef DISABLED_LEGACY_ENGINE
   38 
   39 /**
   40  * The box file is assumed to contain box definitions, one per line, of the
   41  * following format for blob-level boxes:
   42  * @verbatim
   43  *   <UTF8 str> <left> <bottom> <right> <top> <page id>
   44  * @endverbatim
   45  * and for word/line-level boxes:
   46  * @verbatim
   47  *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
   48  * @endverbatim
   49  * NOTES:
   50  * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
   51  *
   52  * <page id> is 0-based, and the page number is used for multipage input (tiff).
   53  *
   54  * In the blob-level form, each line represents a recognizable unit, which may
   55  * be several UTF-8 bytes, but there is a bounding box around each recognizable
   56  * unit, and no classifier is needed to train in this mode (bootstrapping.)
   57  *
   58  * In the word/line-level form, the line begins with the literal "WordStr", and
   59  * the bounding box bounds either a whole line or a whole word. The recognizable
   60  * units in the word/line are listed after the # at the end of the line and
   61  * are space delimited, ignoring any original spaces on the line.
   62  * Eg.
   63  * @verbatim
   64  * word -> #w o r d
   65  * multi word line -> #m u l t i w o r d l i n e
   66  * @endverbatim
   67  * The recognizable units must be space-delimited in order to allow multiple
   68  * unicodes to be used for a single recognizable unit, eg Hindi.
   69  *
   70  * In this mode, the classifier must have been pre-trained with the desired
   71  * character set, or it will not be able to find the character segmentations.
   72  */
   73 
   74 namespace tesseract {
   75 
   76 #ifndef DISABLED_LEGACY_ENGINE
   77 static void clear_any_old_text(BLOCK_LIST *block_list) {
   78   BLOCK_IT block_it(block_list);
   79   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
   80     ROW_IT row_it(block_it.data()->row_list());
   81     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
   82       WERD_IT word_it(row_it.data()->word_list());
   83       for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
   84         word_it.data()->set_text("");
   85       }
   86     }
   87   }
   88 }
   89 
   90 // Applies the box file based on the image name filename, and resegments
   91 // the words in the block_list (page), with:
   92 // blob-mode: one blob per line in the box file, words as input.
   93 // word/line-mode: one blob per space-delimited unit after the #, and one word
   94 // per line in the box file. (See comment above for box file format.)
   95 // If find_segmentation is true, (word/line mode) then the classifier is used
   96 // to re-segment words/lines to match the space-delimited truth string for
   97 // each box. In this case, the input box may be for a word or even a whole
   98 // text line, and the output words will contain multiple blobs corresponding
   99 // to the space-delimited input string.
  100 // With find_segmentation false, no classifier is needed, but the chopper
  101 // can still be used to correctly segment touching characters with the help
  102 // of the input boxes.
  103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
  104 // from normal classification, ie. with a word, chopped_word, rebuild_word,
  105 // seam_array, denorm, box_word, and best_state, but NO best_choice or
  106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
  107 // Instead, the correct_text member of WERD_RES is set, and this may be later
  108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
  109 // is not required before calling ApplyBoxTraining.
  110 PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
  111                                 BLOCK_LIST *block_list) {
  112   std::vector<TBOX> boxes;
  113   std::vector<std::string> texts, full_texts;
  114   if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
  115     return nullptr; // Can't do it.
  116   }
  117 
  118   const int box_count = boxes.size();
  119   int box_failures = 0;
  120 
  121   // In word mode, we use the boxes to make a word for each box, but
  122   // in blob mode we use the existing words and maximally chop them first.
  123   PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
  124   clear_any_old_text(block_list);
  125 
  126   for (int i = 0; i < box_count; i++) {
  127     bool foundit = false;
  128     if (page_res != nullptr) {
  129       foundit =
  130           ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
  131                            (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
  132     } else {
  133       foundit = ResegmentWordBox(block_list, boxes[i],
  134                                  (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
  135     }
  136     if (!foundit) {
  137       box_failures++;
  138       ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
  139     }
  140   }
  141 
  142   if (page_res == nullptr) {
  143     // In word/line mode, we now maximally chop all the words and resegment
  144     // them with the classifier.
  145     page_res = SetupApplyBoxes(boxes, block_list);
  146     ReSegmentByClassification(page_res);
  147   }
  148   if (applybox_debug > 0) {
  149     tprintf("APPLY_BOXES:\n");
  150     tprintf("   Boxes read from boxfile:  %6d\n", box_count);
  151     if (box_failures > 0) {
  152       tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
  153     }
  154   }
  155   TidyUp(page_res);
  156   return page_res;
  157 }
  158 
  159 // Helper computes median xheight in the image.
  160 static double MedianXHeight(BLOCK_LIST *block_list) {
  161   BLOCK_IT block_it(block_list);
  162   STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);
  163   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
  164     ROW_IT row_it(block_it.data()->row_list());
  165     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
  166       xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
  167     }
  168   }
  169   return xheights.median();
  170 }
  171 
  172 /// Any row xheight that is significantly different from the median is set
  173 /// to the median.
  174 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  175   const double median_xheight = MedianXHeight(block_list);
  176   const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  177   // Strip all fuzzy space markers to simplify the PAGE_RES.
  178   BLOCK_IT b_it(block_list);
  179   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
  180     BLOCK *block = b_it.data();
  181     ROW_IT r_it(block->row_list());
  182     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
  183       ROW *row = r_it.data();
  184       const double diff = fabs(row->x_height() - median_xheight);
  185       if (diff > max_deviation) {
  186         if (applybox_debug) {
  187           tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
  188         }
  189         row->set_x_height(static_cast<float>(median_xheight));
  190       }
  191     }
  192   }
  193 }
  194 
  195 /// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
  196 /// All fuzzy spaces are removed, and all the words are maximally chopped.
  197 PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  198   PreenXHeights(block_list);
  199   // Strip all fuzzy space markers to simplify the PAGE_RES.
  200   BLOCK_IT b_it(block_list);
  201   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
  202     BLOCK *block = b_it.data();
  203     ROW_IT r_it(block->row_list());
  204     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
  205       ROW *row = r_it.data();
  206       WERD_IT w_it(row->word_list());
  207       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
  208         WERD *word = w_it.data();
  209         if (word->cblob_list()->empty()) {
  210           delete w_it.extract();
  211         } else {
  212           word->set_flag(W_FUZZY_SP, false);
  213           word->set_flag(W_FUZZY_NON, false);
  214         }
  215       }
  216     }
  217   }
  218   auto *page_res = new PAGE_RES(false, block_list, nullptr);
  219   PAGE_RES_IT pr_it(page_res);
  220   WERD_RES *word_res;
  221   while ((word_res = pr_it.word()) != nullptr) {
  222     MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
  223     pr_it.forward();
  224   }
  225   return page_res;
  226 }
  227 
  228 /// Tests the chopper by exhaustively running chop_one_blob.
  229 /// The word_res will contain filled chopped_word, seam_array, denorm,
  230 /// box_word and best_state for the maximally chopped word.
  231 void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
  232                                   WERD_RES *word_res) {
  233   if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
  234                                      classify_bln_numeric_mode, textord_use_cjk_fp_model,
  235                                      poly_allow_detailed_fx, row, block)) {
  236     word_res->CloneChoppedToRebuild();
  237     return;
  238   }
  239   if (chop_debug) {
  240     tprintf("Maximally chopping word at:");
  241     word_res->word->bounding_box().print();
  242   }
  243   std::vector<BLOB_CHOICE *> blob_choices;
  244   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
  245   auto rating = static_cast<float>(INT8_MAX);
  246   for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
  247     // The rating and certainty are not quite arbitrary. Since
  248     // select_blob_to_chop uses the worst certainty to choose, they all have
  249     // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
  250     // in here, and then divide by e each time they are chopped, which
  251     // should guarantee a set of unequal values for the whole tree of blobs
  252     // produced, however much chopping is required. The chops are thus only
  253     // limited by the ability of the chopper to find suitable chop points,
  254     // and not by the value of the certainties.
  255     auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
  256     blob_choices.push_back(choice);
  257     rating -= 0.125f;
  258   }
  259   const double e = exp(1.0); // The base of natural logs.
  260   unsigned blob_number;
  261   int right_chop_index = 0;
  262   if (!assume_fixed_pitch_char_segment) {
  263     // We only chop if the language is not fixed pitch like CJK.
  264     SEAM *seam = nullptr;
  265     while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
  266       word_res->InsertSeam(blob_number, seam);
  267       BLOB_CHOICE *left_choice = blob_choices[blob_number];
  268       rating = left_choice->rating() / e;
  269       left_choice->set_rating(rating);
  270       left_choice->set_certainty(-rating);
  271       // combine confidence w/ serial #
  272       auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
  273                                            0.0f, 0.0f, BCC_FAKE);
  274       blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
  275     }
  276   }
  277   word_res->CloneChoppedToRebuild();
  278   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
  279 }
  280 
  281 /// Helper to compute the dispute resolution metric.
  282 /// Disputed blob resolution. The aim is to give the blob to the most
  283 /// appropriate boxfile box. Most of the time it is obvious, but if
  284 /// two boxfile boxes overlap significantly it is not. If a small boxfile
  285 /// box takes most of the blob, and a large boxfile box does too, then
  286 /// we want the small boxfile box to get it, but if the small box
  287 /// is much smaller than the blob, we don't want it to get it.
  288 /// Details of the disputed blob resolution:
  289 /// Given a box with area A, and a blob with area B, with overlap area C,
  290 /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
  291 /// miss metric gets the blob.
  292 static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
  293   const int overlap_area = box1.intersection(box2).area();
  294   const int a = box1.area();
  295   const int b = box2.area();
  296   ASSERT_HOST(a != 0 && b != 0);
  297   return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
  298 }
  299 
  300 /// Gather consecutive blobs that match the given box into the best_state
  301 /// and corresponding correct_text.
  302 ///
  303 /// Fights over which box owns which blobs are settled by pre-chopping and
  304 /// applying the blobs to box or next_box with the least non-overlap.
  305 /// @return false if the box was in error, which can only be caused by
  306 /// failing to find an appropriate blob for a box.
  307 ///
  308 /// This means that occasionally, blobs may be incorrectly segmented if the
  309 /// chopper fails to find a suitable chop point.
  310 bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
  311                                  const TBOX *next_box, const char *correct_text) {
  312   if (applybox_debug > 1) {
  313     tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  314   }
  315   PAGE_RES_IT page_res_it(page_res);
  316   WERD_RES *word_res;
  317   for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
  318     if (!word_res->box_word->bounding_box().major_overlap(box)) {
  319       continue;
  320     }
  321     if (applybox_debug > 1) {
  322       tprintf("Checking word box:");
  323       word_res->box_word->bounding_box().print();
  324     }
  325     int word_len = word_res->box_word->length();
  326     for (int i = 0; i < word_len; ++i) {
  327       TBOX char_box = TBOX();
  328       int blob_count = 0;
  329       for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
  330         TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
  331         if (!blob_box.major_overlap(box)) {
  332           break;
  333         }
  334         if (word_res->correct_text[i + blob_count].length() > 0) {
  335           break; // Blob is claimed already.
  336         }
  337         if (next_box != nullptr) {
  338           const double current_box_miss_metric = BoxMissMetric(blob_box, box);
  339           const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
  340           if (applybox_debug > 2) {
  341             tprintf("Checking blob:");
  342             blob_box.print();
  343             tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
  344                     next_box_miss_metric);
  345           }
  346           if (current_box_miss_metric > next_box_miss_metric) {
  347             break; // Blob is a better match for next box.
  348           }
  349         }
  350         char_box += blob_box;
  351       }
  352       if (blob_count > 0) {
  353         if (applybox_debug > 1) {
  354           tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
  355         }
  356         if (!char_box.almost_equal(box, 3) &&
  357             ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
  358              (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
  359           return false;
  360         }
  361         // We refine just the box_word, best_state and correct_text here.
  362         // The rebuild_word is made in TidyUp.
  363         // blob_count blobs are put together to match the box. Merge the
  364         // box_word boxes, save the blob_count in the state and the text.
  365         word_res->box_word->MergeBoxes(i, i + blob_count);
  366         word_res->best_state[i] = blob_count;
  367         word_res->correct_text[i] = correct_text;
  368         if (applybox_debug > 2) {
  369           tprintf("%d Blobs match: blob box:", blob_count);
  370           word_res->box_word->BlobBox(i).print();
  371           tprintf("Matches box:");
  372           box.print();
  373           if (next_box != nullptr) {
  374             tprintf("With next box:");
  375             next_box->print();
  376           }
  377         }
  378         // Eliminated best_state and correct_text entries for the consumed
  379         // blobs.
  380         for (int j = 1; j < blob_count; ++j) {
  381           word_res->best_state.erase(word_res->best_state.begin() + i + 1);
  382           word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
  383         }
  384         // Assume that no box spans multiple source words, so we are done with
  385         // this box.
  386         if (applybox_debug > 1) {
  387           tprintf("Best state = ");
  388           for (auto best_state : word_res->best_state) {
  389             tprintf("%d ", best_state);
  390           }
  391           tprintf("\n");
  392           tprintf("Correct text = [[ ");
  393           for (auto &it : word_res->correct_text) {
  394             tprintf("%s ", it.c_str());
  395           }
  396           tprintf("]]\n");
  397         }
  398         return true;
  399       }
  400     }
  401   }
  402   if (applybox_debug > 0) {
  403     tprintf("FAIL!\n");
  404   }
  405   return false; // Failure.
  406 }
  407 
  408 /// Consume all source blobs that strongly overlap the given box,
  409 /// putting them into a new word, with the correct_text label.
  410 /// Fights over which box owns which blobs are settled by
  411 /// applying the blobs to box or next_box with the least non-overlap.
  412 /// @return false if the box was in error, which can only be caused by
  413 /// failing to find an overlapping blob for a box.
  414 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
  415                                  const char *correct_text) {
  416   if (applybox_debug > 1) {
  417     tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  418   }
  419   WERD *new_word = nullptr;
  420   BLOCK_IT b_it(block_list);
  421   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
  422     BLOCK *block = b_it.data();
  423     if (!box.major_overlap(block->pdblk.bounding_box())) {
  424       continue;
  425     }
  426     ROW_IT r_it(block->row_list());
  427     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
  428       ROW *row = r_it.data();
  429       if (!box.major_overlap(row->bounding_box())) {
  430         continue;
  431       }
  432       WERD_IT w_it(row->word_list());
  433       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
  434         WERD *word = w_it.data();
  435         if (applybox_debug > 2) {
  436           tprintf("Checking word:");
  437           word->bounding_box().print();
  438         }
  439         if (word->text() != nullptr && word->text()[0] != '\0') {
  440           continue; // Ignore words that are already done.
  441         }
  442         if (!box.major_overlap(word->bounding_box())) {
  443           continue;
  444         }
  445         C_BLOB_IT blob_it(word->cblob_list());
  446         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
  447           C_BLOB *blob = blob_it.data();
  448           TBOX blob_box = blob->bounding_box();
  449           if (!blob_box.major_overlap(box)) {
  450             continue;
  451           }
  452           if (next_box != nullptr) {
  453             const double current_box_miss_metric = BoxMissMetric(blob_box, box);
  454             const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
  455             if (applybox_debug > 2) {
  456               tprintf("Checking blob:");
  457               blob_box.print();
  458               tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
  459                       next_box_miss_metric);
  460             }
  461             if (current_box_miss_metric > next_box_miss_metric) {
  462               continue; // Blob is a better match for next box.
  463             }
  464           }
  465           if (applybox_debug > 2) {
  466             tprintf("Blob match: blob:");
  467             blob_box.print();
  468             tprintf("Matches box:");
  469             box.print();
  470             if (next_box != nullptr) {
  471               tprintf("With next box:");
  472               next_box->print();
  473             }
  474           }
  475           if (new_word == nullptr) {
  476             // Make a new word with a single blob.
  477             new_word = word->shallow_copy();
  478             new_word->set_text(correct_text);
  479             w_it.add_to_end(new_word);
  480           }
  481           C_BLOB_IT new_blob_it(new_word->cblob_list());
  482           new_blob_it.add_to_end(blob_it.extract());
  483         }
  484       }
  485     }
  486   }
  487   if (new_word == nullptr && applybox_debug > 0) {
  488     tprintf("FAIL!\n");
  489   }
  490   return new_word != nullptr;
  491 }
  492 
  493 /// Resegments the words by running the classifier in an attempt to find the
  494 /// correct segmentation that produces the required string.
  495 void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
  496   PAGE_RES_IT pr_it(page_res);
  497   WERD_RES *word_res;
  498   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
  499     const WERD *word = word_res->word;
  500     if (word->text() == nullptr || word->text()[0] == '\0') {
  501       continue; // Ignore words that have no text.
  502     }
  503     // Convert the correct text to a vector of UNICHAR_ID
  504     std::vector<UNICHAR_ID> target_text;
  505     if (!ConvertStringToUnichars(word->text(), &target_text)) {
  506       tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
  507       pr_it.DeleteCurrentWord();
  508       continue;
  509     }
  510     if (!FindSegmentation(target_text, word_res)) {
  511       tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
  512       pr_it.DeleteCurrentWord();
  513       continue;
  514     }
  515   }
  516 }
  517 
  518 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
  519 /// @return false if an invalid UNICHAR_ID is encountered.
  520 bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
  521   for (int step = 0; *utf8 != '\0'; utf8 += step) {
  522     const char *next_space = strchr(utf8, ' ');
  523     if (next_space == nullptr) {
  524       next_space = utf8 + strlen(utf8);
  525     }
  526     step = next_space - utf8;
  527     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
  528     if (class_id == INVALID_UNICHAR_ID) {
  529       return false;
  530     }
  531     while (utf8[step] == ' ') {
  532       ++step;
  533     }
  534     class_ids->push_back(class_id);
  535   }
  536   return true;
  537 }
  538 
  539 /// Resegments the word to achieve the target_text from the classifier.
  540 /// Returns false if the re-segmentation fails.
  541 /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
  542 /// applies a full search on the classifier results to find the best classified
  543 /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
  544 /// substitutions ARE used.
  545 bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
  546   // Classify all required combinations of blobs and save results in choices.
  547   const int word_length = word_res->box_word->length();
  548   auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
  549   for (int i = 0; i < word_length; ++i) {
  550     for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
  551       BLOB_CHOICE_LIST *match_result =
  552           classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
  553                          word_res->blamer_bundle);
  554       if (applybox_debug > 2) {
  555         tprintf("%d+%d:", i, j);
  556         print_ratings_list("Segment:", match_result, unicharset);
  557       }
  558       choices[i].push_back(match_result);
  559     }
  560   }
  561   // Search the segmentation graph for the target text. Must be an exact
  562   // match. Using wildcards makes it difficult to find the correct
  563   // segmentation even when it is there.
  564   word_res->best_state.clear();
  565   std::vector<int> search_segmentation;
  566   float best_rating = 0.0f;
  567   SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
  568                 &word_res->best_state);
  569   for (int i = 0; i < word_length; ++i) {
  570     for (auto choice : choices[i]) {
  571       delete choice;
  572     }
  573   }
  574   delete[] choices;
  575   if (word_res->best_state.empty()) {
  576     // Build the original segmentation and if it is the same length as the
  577     // truth, assume it will do.
  578     int blob_count = 1;
  579     for (auto s : word_res->seam_array) {
  580       SEAM *seam = s;
  581       if (!seam->HasAnySplits()) {
  582         word_res->best_state.push_back(blob_count);
  583         blob_count = 1;
  584       } else {
  585         ++blob_count;
  586       }
  587     }
  588     word_res->best_state.push_back(blob_count);
  589     if (word_res->best_state.size() != target_text.size()) {
  590       word_res->best_state.clear(); // No good. Original segmentation bad size.
  591       return false;
  592     }
  593   }
  594   word_res->correct_text.clear();
  595   for (auto &text : target_text) {
  596     word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
  597   }
  598   return true;
  599 }
  600 
  601 /// Recursive helper to find a match to the target_text (from text_index
  602 /// position) in the choices (from choices_pos position).
  603 /// @param choices is an array of vectors of length choices_length,
  604 /// with each element representing a starting position in the word, and the
  605 /// #vector holding classification results for a sequence of consecutive
  606 /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
  607 /// @param choices_pos
  608 /// @param choices_length
  609 /// @param target_text
  610 /// @param text_index
  611 /// @param rating
  612 /// @param segmentation
  613 /// @param best_rating
  614 /// @param best_segmentation
  615 void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
  616                               unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
  617                               unsigned text_index, float rating, std::vector<int> *segmentation,
  618                               float *best_rating, std::vector<int> *best_segmentation) {
  619   const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
  620   for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
  621     // Rating of matching choice or worst choice if no match.
  622     float choice_rating = 0.0f;
  623     // Find the corresponding best BLOB_CHOICE.
  624     BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
  625     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
  626       const BLOB_CHOICE *choice = choice_it.data();
  627       choice_rating = choice->rating();
  628       auto class_id = choice->unichar_id();
  629       if (class_id == target_text[text_index]) {
  630         break;
  631       }
  632       // Search ambigs table.
  633       if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
  634         AmbigSpec_IT spec_it(table[class_id]);
  635         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
  636           const AmbigSpec *ambig_spec = spec_it.data();
  637           // We'll only do 1-1.
  638           if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
  639               ambig_spec->correct_ngram_id == target_text[text_index]) {
  640             break;
  641           }
  642         }
  643         if (!spec_it.cycled_list()) {
  644           break; // Found an ambig.
  645         }
  646       }
  647     }
  648     if (choice_it.cycled_list()) {
  649       continue; // No match.
  650     }
  651     segmentation->push_back(length);
  652     if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
  653       // This is a complete match. If the rating is good record a new best.
  654       if (applybox_debug > 2) {
  655         tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
  656                 rating + choice_rating, *best_rating, segmentation->size(),
  657                 best_segmentation->size());
  658       }
  659       if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
  660         *best_segmentation = *segmentation;
  661         *best_rating = rating + choice_rating;
  662       }
  663     } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
  664       if (applybox_debug > 3) {
  665         tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
  666                 unicharset.id_to_unichar(target_text[text_index]),
  667                 choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
  668                 choices_pos, length);
  669       }
  670       SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
  671                     rating + choice_rating, segmentation, best_rating, best_segmentation);
  672       if (applybox_debug > 3) {
  673         tprintf("End recursion for %d=%s\n", target_text[text_index],
  674                 unicharset.id_to_unichar(target_text[text_index]));
  675       }
  676     }
  677     segmentation->resize(segmentation->size() - 1);
  678   }
  679 }
  680 
  681 /// - Counts up the labelled words and the blobs within.
  682 /// - Deletes all unused or emptied words, counting the unused ones.
  683 /// - Resets W_BOL and W_EOL flags correctly.
  684 /// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
  685 void Tesseract::TidyUp(PAGE_RES *page_res) {
  686   int ok_blob_count = 0;
  687   int bad_blob_count = 0;
  688   int ok_word_count = 0;
  689   int unlabelled_words = 0;
  690   PAGE_RES_IT pr_it(page_res);
  691   WERD_RES *word_res;
  692   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
  693     int ok_in_word = 0;
  694     int blob_count = word_res->correct_text.size();
  695     auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
  696     word_choice->set_permuter(TOP_CHOICE_PERM);
  697     for (int c = 0; c < blob_count; ++c) {
  698       if (word_res->correct_text[c].length() > 0) {
  699         ++ok_in_word;
  700       }
  701       // Since we only need a fake word_res->best_choice, the actual
  702       // unichar_ids do not matter. Which is fortunate, since TidyUp()
  703       // can be called while training Tesseract, at the stage where
  704       // unicharset is not meaningful yet.
  705       word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
  706                                                      1.0f, -1.0f);
  707     }
  708     if (ok_in_word > 0) {
  709       ok_blob_count += ok_in_word;
  710       bad_blob_count += word_res->correct_text.size() - ok_in_word;
  711       word_res->LogNewRawChoice(word_choice);
  712       word_res->LogNewCookedChoice(1, false, word_choice);
  713     } else {
  714       ++unlabelled_words;
  715       if (applybox_debug > 0) {
  716         tprintf("APPLY_BOXES: Unlabelled word at :");
  717         word_res->word->bounding_box().print();
  718       }
  719       pr_it.DeleteCurrentWord();
  720       delete word_choice;
  721     }
  722   }
  723   pr_it.restart_page();
  724   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
  725     // Denormalize back to a BoxWord.
  726     word_res->RebuildBestState();
  727     word_res->SetupBoxWord();
  728     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
  729     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
  730   }
  731   if (applybox_debug > 0) {
  732     tprintf("   Found %d good blobs.\n", ok_blob_count);
  733     if (bad_blob_count > 0) {
  734       tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
  735     }
  736     if (unlabelled_words > 0) {
  737       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
  738     }
  739   }
  740 }
  741 
  742 /** Logs a bad box by line in the box file and box coords.*/
  743 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
  744                                 const char *err_msg) {
  745   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
  746           box.left(), box.bottom(), box.right(), box.top(), err_msg);
  747 }
  748 
  749 /// Calls #LearnWord to extract features for labelled blobs within each word.
  750 /// Features are stored in an internal buffer.
  751 void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
  752   PAGE_RES_IT pr_it(page_res);
  753   int word_count = 0;
  754   for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
  755     LearnWord(fontname.c_str(), word_res);
  756     ++word_count;
  757   }
  758   tprintf("Generated training data for %d words\n", word_count);
  759 }
  760 
  761 #endif // ndef DISABLED_LEGACY_ENGINE
  762 
  763 /** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
  764 void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
  765   PAGE_RES_IT pr_it(page_res);
  766   for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
  767     auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
  768     for (auto &correct_text : word_res->correct_text) {
  769       // The part before the first space is the real ground truth, and the
  770       // rest is the bounding box location and page number.
  771       std::vector<std::string> tokens = split(correct_text, ' ');
  772       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
  773       choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
  774     }
  775     word_res->ClearWordChoices();
  776     word_res->LogNewRawChoice(choice);
  777     word_res->LogNewCookedChoice(1, false, choice);
  778   }
  779 }
  780 
  781 } // namespace tesseract