// Tesseract 3.02 -- applybox.cpp
// (Recovered from a Doxygen source listing; HTML navigation text removed.)
/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:      Phil Cheatle
 * Created:     Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
#include "mfcpch.h"

#ifdef _MSC_VER
#pragma warning(disable:4244)  // Conversion warnings
#endif

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "allheaders.h"
#include "boxread.h"
#include "chopper.h"
#include "pageres.h"
#include "unichar.h"
#include "unicharset.h"
#include "tesseractclass.h"
#include "genericvector.h"
// Max number of blobs to classify together in FindSegmentation.
const int kMaxGroupSize = 4;

// Max fraction of median allowed as deviation in xheight before switching
// to median.
const double kMaxXHeightDeviationFraction = 0.125;
45 
/*************************************************************************
 * The box file is assumed to contain box definitions, one per line, of the
 * following format for blob-level boxes:
 *   <UTF8 str> <left> <bottom> <right> <top> <page id>
 * and for word/line-level boxes:
 *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
 * NOTES:
 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
 *
 * <page id> is 0-based, and the page number is used for multipage input (tiff).
 *
 * In the blob-level form, each line represents a recognizable unit, which may
 * be several UTF-8 bytes, but there is a bounding box around each recognizable
 * unit, and no classifier is needed to train in this mode (bootstrapping.)
 *
 * In the word/line-level form, the line begins with the literal "WordStr", and
 * the bounding box bounds either a whole line or a whole word. The recognizable
 * units in the word/line are listed after the # at the end of the line and
 * are space delimited, ignoring any original spaces on the line.
 * Eg.
 * word -> #w o r d
 * multi word line -> #m u l t i   w o r d   l i n e
 * The recognizable units must be space-delimited in order to allow multiple
 * unicodes to be used for a single recognizable unit, eg Hindi.
 * In this mode, the classifier must have been pre-trained with the desired
 * character set, or it will not be able to find the character segmentations.
 *************************************************************************/
73 
74 namespace tesseract {
75 
76 static void clear_any_old_text(BLOCK_LIST *block_list) {
77  BLOCK_IT block_it(block_list);
78  for (block_it.mark_cycle_pt();
79  !block_it.cycled_list(); block_it.forward()) {
80  ROW_IT row_it(block_it.data()->row_list());
81  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82  WERD_IT word_it(row_it.data()->word_list());
83  for (word_it.mark_cycle_pt();
84  !word_it.cycled_list(); word_it.forward()) {
85  word_it.data()->set_text("");
86  }
87  }
88  }
89 }
90 
91 // Applies the box file based on the image name fname, and resegments
92 // the words in the block_list (page), with:
93 // blob-mode: one blob per line in the box file, words as input.
94 // word/line-mode: one blob per space-delimited unit after the #, and one word
95 // per line in the box file. (See comment above for box file format.)
96 // If find_segmentation is true, (word/line mode) then the classifier is used
97 // to re-segment words/lines to match the space-delimited truth string for
98 // each box. In this case, the input box may be for a word or even a whole
99 // text line, and the output words will contain multiple blobs corresponding
100 // to the space-delimited input string.
101 // With find_segmentation false, no classifier is needed, but the chopper
102 // can still be used to correctly segment touching characters with the help
103 // of the input boxes.
104 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
105 // from normal classification, ie. with a word, chopped_word, rebuild_word,
106 // seam_array, denorm, box_word, and best_state, but NO best_choice or
107 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
108 // Instead, the correct_text member of WERD_RES is set, and this may be later
109 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
110 // is not required before calling ApplyBoxTraining.
112  bool find_segmentation,
113  BLOCK_LIST *block_list) {
114  int box_count = 0;
115  int box_failures = 0;
116 
117  FILE* box_file = OpenBoxFile(fname);
118  TBOX box;
119  GenericVector<TBOX> boxes;
120  GenericVector<STRING> texts, full_texts;
121 
122  bool found_box = true;
123  while (found_box) {
124  int line_number = 0; // Line number of the box file.
125  STRING text, full_text;
126  found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
127  if (found_box) {
128  ++box_count;
129  MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
130  } else {
131  full_text = "";
132  }
133  boxes.push_back(box);
134  texts.push_back(text);
135  full_texts.push_back(full_text);
136  }
137 
138  // In word mode, we use the boxes to make a word for each box, but
139  // in blob mode we use the existing words and maximally chop them first.
140  PAGE_RES* page_res = find_segmentation ?
141  NULL : SetupApplyBoxes(boxes, block_list);
142  clear_any_old_text(block_list);
143 
144  for (int i = 0; i < boxes.size() - 1; i++) {
145  bool foundit = false;
146  if (page_res != NULL) {
147  if (i == 0) {
148  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
149  full_texts[i].string());
150  } else {
151  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
152  boxes[i + 1], full_texts[i].string());
153  }
154  } else {
155  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
156  texts[i].string());
157  }
158  if (!foundit) {
159  box_failures++;
160  ReportFailedBox(i, boxes[i], texts[i].string(),
161  "FAILURE! Couldn't find a matching blob");
162  }
163  }
164 
165  if (page_res == NULL) {
166  // In word/line mode, we now maximally chop all the words and resegment
167  // them with the classifier.
168  page_res = SetupApplyBoxes(boxes, block_list);
169  ReSegmentByClassification(page_res);
170  }
171  if (applybox_debug > 0) {
172  tprintf("APPLY_BOXES:\n");
173  tprintf(" Boxes read from boxfile: %6d\n", box_count);
174  if (box_failures > 0)
175  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
176  }
177  TidyUp(page_res);
178  return page_res;
179 }
180 
181 // Helper computes median xheight in the image.
182 static double MedianXHeight(BLOCK_LIST *block_list) {
183  BLOCK_IT block_it(block_list);
184  STATS xheights(0, block_it.data()->bounding_box().height());
185  for (block_it.mark_cycle_pt();
186  !block_it.cycled_list(); block_it.forward()) {
187  ROW_IT row_it(block_it.data()->row_list());
188  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
189  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
190  }
191  }
192  return xheights.median();
193 }
194 
195 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
196 // All fuzzy spaces are removed, and all the words are maximally chopped.
198  BLOCK_LIST *block_list) {
199  double median_xheight = MedianXHeight(block_list);
200  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
201  // Strip all fuzzy space markers to simplify the PAGE_RES.
202  BLOCK_IT b_it(block_list);
203  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
204  BLOCK* block = b_it.data();
205  ROW_IT r_it(block->row_list());
206  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
207  ROW* row = r_it.data();
208  float diff = fabs(row->x_height() - median_xheight);
209  if (diff > max_deviation) {
210  if (applybox_debug) {
211  tprintf("row xheight=%g, but median xheight = %g\n",
212  row->x_height(), median_xheight);
213  }
214  row->set_x_height(static_cast<float>(median_xheight));
215  }
216  WERD_IT w_it(row->word_list());
217  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
218  WERD* word = w_it.data();
219  if (word->cblob_list()->empty()) {
220  delete w_it.extract();
221  } else {
222  word->set_flag(W_FUZZY_SP, false);
223  word->set_flag(W_FUZZY_NON, false);
224  }
225  }
226  }
227  }
228  PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
229  PAGE_RES_IT pr_it(page_res);
230  WERD_RES* word_res;
231  while ((word_res = pr_it.word()) != NULL) {
232  MaximallyChopWord(boxes, pr_it.block()->block,
233  pr_it.row()->row, word_res);
234  pr_it.forward();
235  }
236  return page_res;
237 }
238 
239 // Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
240 // the top choices. Avoids problems with very long words.
241 static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
242  const UNICHARSET& unicharset,
243  WERD_CHOICE* word_choice) {
244  *word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
245  word_choice->make_bad();
246  for (int i = 0; i < char_choices.size(); ++i) {
247  BLOB_CHOICE_IT it(char_choices[i]);
248  BLOB_CHOICE* bc = it.data();
249  word_choice->append_unichar_id(bc->unichar_id(), 1,
250  bc->rating(), bc->certainty());
251  }
252 }
253 
254 // Tests the chopper by exhaustively running chop_one_blob.
255 // The word_res will contain filled chopped_word, seam_array, denorm,
256 // box_word and best_state for the maximally chopped word.
258  BLOCK* block, ROW* row,
259  WERD_RES* word_res) {
260  if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
262  row, block)) {
263  word_res->CloneChoppedToRebuild();
264  return;
265  }
266  if (chop_debug) {
267  tprintf("Maximally chopping word at:");
268  word_res->word->bounding_box().print();
269  }
271  BLOB_CHOICE_LIST *match_result;
272  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
273  ASSERT_HOST(word_res->chopped_word->blobs != NULL);
274  float rating = static_cast<float>(MAX_INT8);
275  for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
276  blob = blob->next) {
277  // The rating and certainty are not quite arbitrary. Since
278  // select_blob_to_chop uses the worst certainty to choose, they all have
279  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
280  // in here, and then divide by e each time they are chopped, which
281  // should guarantee a set of unequal values for the whole tree of blobs
282  // produced, however much chopping is required. The chops are thus only
283  // limited by the ability of the chopper to find suitable chop points,
284  // and not by the value of the certainties.
285  match_result = fake_classify_blob(0, rating, -rating);
286  modify_blob_choice(match_result, 0);
287  ASSERT_HOST(!match_result->empty());
288  *char_choices += match_result;
289  rating -= 0.125f;
290  }
291  inT32 blob_number;
292  int right_chop_index = 0;
294  // We only chop if the language is not fixed pitch like CJK.
295  if (prioritize_division) {
296  while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
297  } else {
298  while (chop_one_blob(word_res->chopped_word, char_choices,
299  &blob_number, &word_res->seam_array,
300  &right_chop_index));
301  }
302  }
303  MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
304  MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
305  word_res->CloneChoppedToRebuild();
307  if (char_choices != NULL) {
308  char_choices->delete_data_pointers();
309  delete char_choices;
310  }
311 }
312 
313 // Helper to compute the dispute resolution metric.
314 // Disputed blob resolution. The aim is to give the blob to the most
315 // appropriate boxfile box. Most of the time it is obvious, but if
316 // two boxfile boxes overlap significantly it is not. If a small boxfile
317 // box takes most of the blob, and a large boxfile box does too, then
318 // we want the small boxfile box to get it, but if the small box
319 // is much smaller than the blob, we don't want it to get it.
320 // Details of the disputed blob resolution:
321 // Given a box with area A, and a blob with area B, with overlap area C,
322 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
323 // miss metric gets the blob.
324 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
325  int overlap_area = box1.intersection(box2).area();
326  double miss_metric = box1.area()- overlap_area;
327  miss_metric /= box1.area();
328  miss_metric *= box2.area() - overlap_area;
329  miss_metric /= box2.area();
330  return miss_metric;
331 }
332 
333 // Gather consecutive blobs that match the given box into the best_state
334 // and corresponding correct_text.
335 // Fights over which box owns which blobs are settled by pre-chopping and
336 // applying the blobs to box or next_box with the least non-overlap.
337 // Returns false if the box was in error, which can only be caused by
338 // failing to find an appropriate blob for a box.
339 // This means that occasionally, blobs may be incorrectly segmented if the
340 // chopper fails to find a suitable chop point.
341 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
342  const TBOX& box, const TBOX& next_box,
343  const char* correct_text) {
344  if (applybox_debug > 1) {
345  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
346  }
347  PAGE_RES_IT page_res_it(page_res);
348  WERD_RES* word_res;
349  for (word_res = page_res_it.word(); word_res != NULL;
350  word_res = page_res_it.forward()) {
351  if (!word_res->box_word->bounding_box().major_overlap(box))
352  continue;
353  if (applybox_debug > 1) {
354  tprintf("Checking word box:");
355  word_res->box_word->bounding_box().print();
356  }
357  int word_len = word_res->box_word->length();
358  for (int i = 0; i < word_len; ++i) {
359  TBOX char_box = TBOX();
360  int blob_count = 0;
361  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
362  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
363  if (!blob_box.major_overlap(box))
364  break;
365  if (word_res->correct_text[i + blob_count].length() > 0)
366  break; // Blob is claimed already.
367  double current_box_miss_metric = BoxMissMetric(blob_box, box);
368  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
369  if (applybox_debug > 2) {
370  tprintf("Checking blob:");
371  blob_box.print();
372  tprintf("Current miss metric = %g, next = %g\n",
373  current_box_miss_metric, next_box_miss_metric);
374  }
375  if (current_box_miss_metric > next_box_miss_metric)
376  break; // Blob is a better match for next box.
377  char_box += blob_box;
378  }
379  if (blob_count > 0) {
380  if (applybox_debug > 1) {
381  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
382  }
383  if (!char_box.almost_equal(box, 3) &&
384  (box.x_gap(next_box) < -3 ||
385  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
386  return false;
387  }
388  // We refine just the box_word, best_state and correct_text here.
389  // The rebuild_word is made in TidyUp.
390  // blob_count blobs are put together to match the box. Merge the
391  // box_word boxes, save the blob_count in the state and the text.
392  word_res->box_word->MergeBoxes(i, i + blob_count);
393  word_res->best_state[i] = blob_count;
394  word_res->correct_text[i] = correct_text;
395  if (applybox_debug > 2) {
396  tprintf("%d Blobs match: blob box:", blob_count);
397  word_res->box_word->BlobBox(i).print();
398  tprintf("Matches box:");
399  box.print();
400  tprintf("With next box:");
401  next_box.print();
402  }
403  // Eliminated best_state and correct_text entries for the consumed
404  // blobs.
405  for (int j = 1; j < blob_count; ++j) {
406  word_res->best_state.remove(i + 1);
407  word_res->correct_text.remove(i + 1);
408  }
409  // Assume that no box spans multiple source words, so we are done with
410  // this box.
411  if (applybox_debug > 1) {
412  tprintf("Best state = ");
413  for (int j = 0; j < word_res->best_state.size(); ++j) {
414  tprintf("%d ", word_res->best_state[j]);
415  }
416  tprintf("\n");
417  tprintf("Correct text = [[ ");
418  for (int j = 0; j < word_res->correct_text.size(); ++j) {
419  tprintf("%s ", word_res->correct_text[j].string());
420  }
421  tprintf("]]\n");
422  }
423  return true;
424  }
425  }
426  }
427  if (applybox_debug > 0) {
428  tprintf("FAIL!\n");
429  }
430  return false; // Failure.
431 }
432 
433 // Consume all source blobs that strongly overlap the given box,
434 // putting them into a new word, with the correct_text label.
435 // Fights over which box owns which blobs are settled by
436 // applying the blobs to box or next_box with the least non-overlap.
437 // Returns false if the box was in error, which can only be caused by
438 // failing to find an overlapping blob for a box.
439 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
440  const TBOX& box, const TBOX& next_box,
441  const char* correct_text) {
442  if (applybox_debug > 1) {
443  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
444  }
445  WERD* new_word = NULL;
446  BLOCK_IT b_it(block_list);
447  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
448  BLOCK* block = b_it.data();
449  if (!box.major_overlap(block->bounding_box()))
450  continue;
451  ROW_IT r_it(block->row_list());
452  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
453  ROW* row = r_it.data();
454  if (!box.major_overlap(row->bounding_box()))
455  continue;
456  WERD_IT w_it(row->word_list());
457  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
458  WERD* word = w_it.data();
459  if (applybox_debug > 2) {
460  tprintf("Checking word:");
461  word->bounding_box().print();
462  }
463  if (word->text() != NULL && word->text()[0] != '\0')
464  continue; // Ignore words that are already done.
465  if (!box.major_overlap(word->bounding_box()))
466  continue;
467  C_BLOB_IT blob_it(word->cblob_list());
468  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
469  blob_it.forward()) {
470  C_BLOB* blob = blob_it.data();
471  TBOX blob_box = blob->bounding_box();
472  if (!blob_box.major_overlap(box))
473  continue;
474  double current_box_miss_metric = BoxMissMetric(blob_box, box);
475  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
476  if (applybox_debug > 2) {
477  tprintf("Checking blob:");
478  blob_box.print();
479  tprintf("Current miss metric = %g, next = %g\n",
480  current_box_miss_metric, next_box_miss_metric);
481  }
482  if (current_box_miss_metric > next_box_miss_metric)
483  continue; // Blob is a better match for next box.
484  if (applybox_debug > 2) {
485  tprintf("Blob match: blob:");
486  blob_box.print();
487  tprintf("Matches box:");
488  box.print();
489  tprintf("With next box:");
490  next_box.print();
491  }
492  if (new_word == NULL) {
493  // Make a new word with a single blob.
494  new_word = word->shallow_copy();
495  new_word->set_text(correct_text);
496  w_it.add_to_end(new_word);
497  }
498  C_BLOB_IT new_blob_it(new_word->cblob_list());
499  new_blob_it.add_to_end(blob_it.extract());
500  }
501  }
502  }
503  }
504  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
505  return new_word != NULL;
506 }
507 
508 // Resegments the words by running the classifier in an attempt to find the
509 // correct segmentation that produces the required string.
511  PAGE_RES_IT pr_it(page_res);
512  WERD_RES* word_res;
513  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
514  WERD* word = word_res->word;
515  if (word->text() == NULL || word->text()[0] == '\0')
516  continue; // Ignore words that have no text.
517  // Convert the correct text to a vector of UNICHAR_ID
518  GenericVector<UNICHAR_ID> target_text;
519  if (!ConvertStringToUnichars(word->text(), &target_text)) {
520  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
521  word->text());
522  pr_it.DeleteCurrentWord();
523  continue;
524  }
525  if (!FindSegmentation(target_text, word_res)) {
526  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
527  word->text());
528  pr_it.DeleteCurrentWord();
529  continue;
530  }
531  }
532 }
533 
534 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
535 // Returns false if an invalid UNICHAR_ID is encountered.
537  GenericVector<UNICHAR_ID>* class_ids) {
538  for (int step = 0; *utf8 != '\0'; utf8 += step) {
539  const char* next_space = strchr(utf8, ' ');
540  if (next_space == NULL)
541  next_space = utf8 + strlen(utf8);
542  step = next_space - utf8;
543  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
544  if (class_id == INVALID_UNICHAR_ID) {
545  return false;
546  }
547  while (utf8[step] == ' ')
548  ++step;
549  class_ids->push_back(class_id);
550  }
551  return true;
552 }
553 
554 // Resegments the word to achieve the target_text from the classifier.
555 // Returns false if the re-segmentation fails.
556 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
557 // applies a full search on the classifier results to find the best classified
558 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
559 // substitutions ARE used.
561  WERD_RES* word_res) {
563  // Classify all required combinations of blobs and save results in choices.
564  int word_length = word_res->box_word->length();
566  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
567  for (int i = 0; i < word_length; ++i) {
568  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
569  BLOB_CHOICE_LIST* match_result = classify_piece(
570  word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
571  i, i + j - 1, word_res->blamer_bundle);
572  if (applybox_debug > 2) {
573  tprintf("%d+%d:", i, j);
574  print_ratings_list("Segment:", match_result, unicharset);
575  }
576  choices[i].push_back(match_result);
577  }
578  }
579  // Search the segmentation graph for the target text. Must be an exact
580  // match. Using wildcards makes it difficult to find the correct
581  // segmentation even when it is there.
582  word_res->best_state.clear();
583  GenericVector<int> search_segmentation;
584  float best_rating = 0.0f;
585  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
586  &search_segmentation, &best_rating, &word_res->best_state);
588  for (int i = 0; i < word_length; ++i)
589  choices[i].delete_data_pointers();
590  delete [] choices;
591  if (word_res->best_state.empty()) {
592  // Build the original segmentation and if it is the same length as the
593  // truth, assume it will do.
594  int blob_count = 1;
595  for (int s = 0; s < array_count(word_res->seam_array); ++s) {
596  SEAM* seam =
597  reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
598  if (seam->split1 == NULL) {
599  word_res->best_state.push_back(blob_count);
600  blob_count = 1;
601  } else {
602  ++blob_count;
603  }
604  }
605  word_res->best_state.push_back(blob_count);
606  if (word_res->best_state.size() != target_text.size()) {
607  word_res->best_state.clear(); // No good. Original segmentation bad size.
608  return false;
609  }
610  }
611  word_res->correct_text.clear();
612  for (int i = 0; i < target_text.size(); ++i) {
613  word_res->correct_text.push_back(
614  STRING(unicharset.id_to_unichar(target_text[i])));
615  }
616  return true;
617 }
618 
619 // Recursive helper to find a match to the target_text (from text_index
620 // position) in the choices (from choices_pos position).
621 // Choices is an array of GenericVectors, of length choices_length, with each
622 // element representing a starting position in the word, and the
623 // GenericVector holding classification results for a sequence of consecutive
624 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
626  int choices_pos, int choices_length,
627  const GenericVector<UNICHAR_ID>& target_text,
628  int text_index,
629  float rating, GenericVector<int>* segmentation,
630  float* best_rating,
631  GenericVector<int>* best_segmentation) {
633  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
634  // Rating of matching choice or worst choice if no match.
635  float choice_rating = 0.0f;
636  // Find the corresponding best BLOB_CHOICE.
637  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
638  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
639  choice_it.forward()) {
640  BLOB_CHOICE* choice = choice_it.data();
641  choice_rating = choice->rating();
642  UNICHAR_ID class_id = choice->unichar_id();
643  if (class_id == target_text[text_index]) {
644  break;
645  }
646  // Search ambigs table.
647  if (class_id < table.size() && table[class_id] != NULL) {
648  AmbigSpec_IT spec_it(table[class_id]);
649  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
650  spec_it.forward()) {
651  const AmbigSpec *ambig_spec = spec_it.data();
652  // We'll only do 1-1.
653  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
654  ambig_spec->correct_ngram_id == target_text[text_index])
655  break;
656  }
657  if (!spec_it.cycled_list())
658  break; // Found an ambig.
659  }
660  }
661  if (choice_it.cycled_list())
662  continue; // No match.
663  segmentation->push_back(length);
664  if (choices_pos + length == choices_length &&
665  text_index + 1 == target_text.size()) {
666  // This is a complete match. If the rating is good record a new best.
667  if (applybox_debug > 2) {
668  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
669  rating + choice_rating, *best_rating, segmentation->size(),
670  best_segmentation->size());
671  }
672  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
673  *best_segmentation = *segmentation;
674  *best_rating = rating + choice_rating;
675  }
676  } else if (choices_pos + length < choices_length &&
677  text_index + 1 < target_text.size()) {
678  if (applybox_debug > 3) {
679  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
680  target_text[text_index],
681  unicharset.id_to_unichar(target_text[text_index]),
682  choice_it.data()->unichar_id() == target_text[text_index]
683  ? "Match" : "Ambig",
684  choices_pos, length);
685  }
686  SearchForText(choices, choices_pos + length, choices_length, target_text,
687  text_index + 1, rating + choice_rating, segmentation,
688  best_rating, best_segmentation);
689  if (applybox_debug > 3) {
690  tprintf("End recursion for %d=%s\n", target_text[text_index],
691  unicharset.id_to_unichar(target_text[text_index]));
692  }
693  }
694  segmentation->truncate(segmentation->size() - 1);
695  }
696 }
697 
698 // Counts up the labelled words and the blobs within.
699 // Deletes all unused or emptied words, counting the unused ones.
700 // Resets W_BOL and W_EOL flags correctly.
701 // Builds the rebuild_word and rebuilds the box_word and the best_choice.
702 void Tesseract::TidyUp(PAGE_RES* page_res) {
703  int ok_blob_count = 0;
704  int bad_blob_count = 0;
705  int ok_word_count = 0;
706  int unlabelled_words = 0;
707  PAGE_RES_IT pr_it(page_res);
708  WERD_RES* word_res;
709  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
710  int ok_in_word = 0;
711  BLOB_CHOICE_LIST_VECTOR char_choices;
712  for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
713  if (word_res->correct_text[i].length() > 0) {
714  ++ok_in_word;
715  }
716  // Since we only need a fake word_res->best_choice, the actual
717  // unichar_ids do not matter. Which is fortunate, since TidyUp()
718  // can be called while training Tesseract, at the stage where
719  // unicharset is not meaningful yet.
720  char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
721  }
722  if (ok_in_word > 0) {
723  ok_blob_count += ok_in_word;
724  bad_blob_count += word_res->correct_text.size() - ok_in_word;
725  MakeWordChoice(char_choices, unicharset, word_res->best_choice);
726  } else {
727  ++unlabelled_words;
728  if (applybox_debug > 0) {
729  tprintf("APPLY_BOXES: Unlabelled word at :");
730  word_res->word->bounding_box().print();
731  }
732  pr_it.DeleteCurrentWord();
733  }
734  char_choices.delete_data_pointers();
735  }
736  pr_it.restart_page();
737  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
738  // Denormalize back to a BoxWord.
739  word_res->RebuildBestState();
740  word_res->SetupBoxWord();
741  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
742  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
743  }
744  if (applybox_debug > 0) {
745  tprintf(" Found %d good blobs.\n", ok_blob_count);
746  if (bad_blob_count > 0) {
747  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
748  bad_blob_count, ok_word_count);
749  }
750  if (unlabelled_words > 0)
751  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
752  }
753 }
754 
755 // Logs a bad box by line in the box file and box coords.
756 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
757  const char *box_ch, const char *err_msg) {
758  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
759  boxfile_lineno, box_ch,
760  box.left(), box.bottom(), box.right(), box.top(), err_msg);
761 }
762 
763 // Creates a fake best_choice entry in each WERD_RES with the correct text.
765  PAGE_RES_IT pr_it(page_res);
766  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
767  word_res = pr_it.forward()) {
768  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
769  word_res->correct_text.size());
770  for (int i = 0; i < word_res->correct_text.size(); ++i) {
771  // The part before the first space is the real ground truth, and the
772  // rest is the bounding box location and page number.
773  GenericVector<STRING> tokens;
774  word_res->correct_text[i].split(' ', &tokens);
775  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
776  choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
777  }
778  if (word_res->best_choice != NULL)
779  delete word_res->best_choice;
780  word_res->best_choice = choice;
781  }
782 }
783 
784 // Calls LearnWord to extract features for labelled blobs within each word.
785 // Features are written to the given filename.
787  PAGE_RES_IT pr_it(page_res);
788  int word_count = 0;
789  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
790  word_res = pr_it.forward()) {
791  LearnWord(filename.string(), NULL, word_res);
792  ++word_count;
793  }
794  tprintf("Generated training data for %d words\n", word_count);
795 }
796 
797 
798 } // namespace tesseract