Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
control.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: control.cpp (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 11:09:58 BST 1992
6  * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "mfcpch.h"
22 
23 #include <string.h>
24 #include <math.h>
25 #ifdef __UNIX__
26 #include <assert.h>
27 #include <unistd.h>
28 #include <errno.h>
29 #endif
30 #include <ctype.h>
31 #include "ocrclass.h"
32 #include "werdit.h"
33 #include "drawfx.h"
34 #include "tfacep.h"
35 #include "tessbox.h"
36 #include "tessvars.h"
37 #include "pgedit.h"
38 #include "reject.h"
39 #include "fixspace.h"
40 #include "docqual.h"
41 #include "control.h"
42 #include "secname.h"
43 #include "output.h"
44 #include "callcpp.h"
45 #include "notdll.h"
46 #include "globals.h"
47 #include "sorthelper.h"
48 #include "tesseractclass.h"
49 
50 // Include automatically generated configuration file if running autoconf.
51 #ifdef HAVE_CONFIG_H
52 #include "config_auto.h"
53 #endif
54 
55 #define MIN_FONT_ROW_COUNT 8
56 #define MAX_XHEIGHT_DIFF 3
57 
58 const char* const kBackUpConfigFile = "tempconfigdata.config";
59 // Multiple of x-height to make a repeated word have spaces in it.
60 const double kRepcharGapThreshold = 0.5;
61 
62 
71 namespace tesseract {
73  TBOX &selection_box) {
74  WERD *word;
75  ROW *pseudo_row; // row of word
76  BLOCK *pseudo_block; // block of word
77 
78  word = make_pseudo_word(page_res, selection_box,
79  pseudo_block, pseudo_row);
80  if (word != NULL) {
81  WERD_RES word_res(word);
82  recog_interactive(pseudo_block, pseudo_row, &word_res);
83  delete word;
84  }
85 }
86 
87 
98  inT16 char_qual;
99  inT16 good_char_qual;
100 
102  block, row, word_res);
104  word_char_quality(word_res, row, &char_qual, &good_char_qual);
105  tprintf
106  ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
107  word_res->reject_map.length(), word_blob_quality(word_res, row),
108  word_outline_errs(word_res), char_qual, good_char_qual);
109  }
110  return TRUE;
111 }
112 
113 // Helper function to check for a target word and handle it appropriately.
114 // Inspired by Jetsoft's requirement to process only single words on pass2
115 // and beyond.
116 // If word_config is not null:
117 // If the word_box and target_word_box overlap, read the word_config file
118 // else reset to previous config data.
119 // return true.
120 // else
121 // If the word_box and target_word_box overlap or pass <= 1, return true.
122 // Note that this function uses a fixed temporary file for storing the previous
123 // configs, so it is neither thread-safe, nor process-safe, but the assumption
124 // is that it will only be used for one debug window at a time.
125 //
126 // Since this function is used for debugging (and not to change OCR results)
127 // set only debug params from the word config file.
//
// Parameters:
//   word_box        - bounding box of the word being examined.
//   target_word_box - bounding box of the word of interest; "overlap" above
//                     means word_box.major_overlap(target_word_box).
//   word_config     - params file to apply while on the target word, or NULL
//                     to get plain target-word filtering.
//   pass            - pass number; only consulted when word_config is NULL.
// Returns false only when word_config is NULL, pass > 1 and the boxes have no
// major overlap; returns true in every other case.
// Side effect: backup_config_file_ is set to kBackUpConfigFile when the
// word_config params are applied and reset to NULL when the saved params are
// read back, so it doubles as the "override currently active" flag.
// NOTE(review): this listing jumps from source line 139 to 141 and from 145
// to 147, so part of each ReadParamsFile argument list is not visible here.
128 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
129  const TBOX& target_word_box,
130  const char* word_config,
131  int pass) {
132  if (word_config != NULL) {
133  if (word_box.major_overlap(target_word_box)) {
// Only act on first arrival at the target word (no backup taken yet).
134  if (backup_config_file_ == NULL) {
135  backup_config_file_ = kBackUpConfigFile;
// Dump the current params to the backup file before overriding them.
// NOTE(review): the fopen() result is passed to PrintParams() and fclose()
// without a NULL check.
136  FILE* config_fp = fopen(backup_config_file_, "wb");
137  ParamUtils::PrintParams(config_fp, params());
138  fclose(config_fp);
// Apply the word-specific params.
139  ParamUtils::ReadParamsFile(word_config,
141  params());
142  }
143  } else {
// Off the target word: if an override is active, reload the backed-up params.
144  if (backup_config_file_ != NULL) {
145  ParamUtils::ReadParamsFile(backup_config_file_,
147  params());
148  backup_config_file_ = NULL;
149  }
150  }
// No word_config: after pass 1, words that miss the target are skipped.
151  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
152  return false;
153  }
154  return true;
155 }
156 
179  ETEXT_DESC* monitor,
180  const TBOX* target_word_box,
181  const char* word_config,
182  int dopasses) {
183  PAGE_RES_IT page_res_it;
184  inT32 word_index; // current word
185 
187  tessedit_test_adaption.set_value (TRUE);
188  tessedit_minimal_rejection.set_value (TRUE);
189  }
190 
191  // Before the main recognition loop below, walk through the whole page and set
192  // up fake words. That way, if we run out of time a user will still get the
193  // expected best_choice and box_words out the end; they'll just be empty.
194  page_res_it.page_res = page_res;
195  for (page_res_it.restart_page(); page_res_it.word() != NULL;
196  page_res_it.forward()) {
197  page_res_it.word()->SetupFake(unicharset);
198  }
199 
200  if (dopasses==0 || dopasses==1) {
201  page_res_it.page_res=page_res;
202  page_res_it.restart_page();
203 
204  // ****************** Pass 1 *******************
205 
206  // Clear adaptive classifier at the beginning of the page if it is full.
207  // This is done only at the beginning of the page to ensure that the
208  // classifier is not reset at an arbitrary point while processing the page,
209  // which would cripple Passes 2+ if the reset happens towards the end of
210  // Pass 1 on a page with very difficult text.
211  // TODO(daria): preemptively clear the classifier if it is almost full.
213  // Now check the sub-langs as well.
214  for (int i = 0; i < sub_langs_.size(); ++i) {
215  if (sub_langs_[i]->AdaptiveClassifierIsFull())
216  sub_langs_[i]->ResetAdaptiveClassifierInternal();
217  }
218 
219  stats_.word_count = 0;
220  if (monitor != NULL) {
221  monitor->ocr_alive = TRUE;
222  while (page_res_it.word() != NULL) {
223  stats_.word_count++;
224  page_res_it.forward();
225  }
226  page_res_it.restart_page();
227  } else {
228  stats_.word_count = 1;
229  }
230 
231  word_index = 0;
232 
233  stats_.dict_words = 0;
234  stats_.doc_blob_quality = 0;
235  stats_.doc_outline_errs = 0;
236  stats_.doc_char_quality = 0;
237  stats_.good_char_count = 0;
238  stats_.doc_good_char_quality = 0;
239 
240  most_recently_used_ = this;
241  while (page_res_it.word() != NULL) {
243  word_index++;
244  if (monitor != NULL) {
245  monitor->ocr_alive = TRUE;
246  monitor->progress = 30 + 50 * word_index / stats_.word_count;
247  if (monitor->deadline_exceeded() ||
248  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
249  stats_.dict_words)))
250  return false;
251  }
252  if (target_word_box &&
253  !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
254  *target_word_box, word_config, 1)) {
255  page_res_it.forward();
256  continue;
257  }
259  page_res_it.block()->block,
260  page_res_it.row()->row,
261  page_res_it.word());
262  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
263  fix_rep_char(&page_res_it);
264  page_res_it.forward();
265  continue;
266  }
267  if (tessedit_dump_choices) {
268  word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
269  tprintf("Pass1: %s [%s]\n",
270  page_res_it.word()->best_choice->unichar_string().string(),
271  page_res_it.word()->best_choice->debug_string().string());
272  }
273 
274  // tessedit_test_adaption enables testing of the accuracy of the
275  // input to the adaptive classifier.
277  if (!word_adaptable (page_res_it.word(),
279  page_res_it.word()->reject_map.rej_word_tess_failure();
280  // FAKE PERM REJ
281  } else {
282  // Override rejection mechanisms for this word.
283  UNICHAR_ID space = unicharset.unichar_to_id(" ");
284  for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) {
285  if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
286  page_res_it.word()->reject_map[i].rejected())
287  page_res_it.word()->reject_map[i].setrej_minimal_rej_accept();
288  }
289  }
290  }
291 
292  // Count dict words.
293  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
294  ++(stats_.dict_words);
295 
296  // Update misadaption log (we only need to do it on pass 1, since
297  // adaption only happens on this pass).
298  if (page_res_it.word()->blamer_bundle != NULL &&
299  page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
300  page_res->misadaption_log.push_back(
301  page_res_it.word()->blamer_bundle->misadaption_debug);
302  }
303 
304  page_res_it.forward();
305  }
306  }
307 
308  if (dopasses == 1) return true;
309 
310  // ****************** Pass 2 *******************
311  page_res_it.restart_page();
312  word_index = 0;
313  most_recently_used_ = this;
314  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
316  word_index++;
317  if (monitor != NULL) {
318  monitor->ocr_alive = TRUE;
319  monitor->progress = 80 + 10 * word_index / stats_.word_count;
320  if (monitor->deadline_exceeded() ||
321  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
322  stats_.dict_words)))
323  return false;
324  }
325 
326  // changed by jetsoft
327  // specific to its needs to extract one word when need
328  if (target_word_box &&
329  !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
330  *target_word_box, word_config, 2)) {
331  page_res_it.forward();
332  continue;
333  }
334  // end jetsoft
335 
337  page_res_it.block()->block,
338  page_res_it.row()->row,
339  page_res_it.word());
340  if (page_res_it.word()->word->flag(W_REP_CHAR) &&
341  !page_res_it.word()->done) {
342  fix_rep_char(&page_res_it);
343  page_res_it.forward();
344  continue;
345  }
346  if (tessedit_dump_choices) {
347  word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
348  tprintf("Pass2: %s [%s]\n",
349  page_res_it.word()->best_choice->unichar_string().string(),
350  page_res_it.word()->best_choice->debug_string().string());
351  }
352  page_res_it.forward();
353  }
354 
355  // The next passes can only be run if tesseract has been used, as cube
356  // doesn't set all the necessary outputs in WERD_RES.
359  // ****************** Pass 3 *******************
360  // Fix fuzzy spaces.
362 
365  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
366 
367  // ****************** Pass 4 *******************
369 
370  // ****************** Pass 5,6 *******************
371  rejection_passes(page_res, monitor, target_word_box, word_config);
372 
373  // ****************** Pass 7 *******************
374  // Cube combiner.
375  // If cube is loaded and its combiner is present, run it.
377  run_cube_combiner(page_res);
378  }
379 
380  // ****************** Pass 8 *******************
381  font_recognition_pass(page_res);
382 
383  // ****************** Pass 9 *******************
384  // Check the correctness of the final results.
385  blamer_pass(page_res);
386  }
387 
388  if (!save_blob_choices) {
389  // We aren't saving the blob choices so get rid of them now.
390  // set_blob_choices() does a deep clear.
391  page_res_it.restart_page();
392  while (page_res_it.word() != NULL) {
393  WERD_RES* word = page_res_it.word();
395  page_res_it.forward();
396  }
397  }
398 
399  // Write results pass.
401 // This is now redundant, but retained as a comment to show how to obtain
402 // bounding boxes and style information.
403 
404  // changed by jetsoft
405  // needed for dll to output memory structure
406  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
407  output_pass(page_res_it, target_word_box);
408  // end jetsoft
409  PageSegMode pageseg_mode = static_cast<PageSegMode>(
410  static_cast<int>(tessedit_pageseg_mode));
411  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
412 
413  if (monitor != NULL) {
414  monitor->progress = 100;
415  }
416  return true;
417 }
418 
420  PAGE_RES_IT word_it(page_res);
421 
422  WERD_RES *w_prev = NULL;
423  WERD_RES *w = word_it.word();
424  while (1) {
425  w_prev = w;
426  while (word_it.forward() != NULL &&
427  (!word_it.word() || word_it.word()->part_of_combo)) {
428  // advance word_it, skipping over parts of combos
429  }
430  if (!word_it.word()) break;
431  w = word_it.word();
432  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
433  continue;
434  }
435  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
436  if (tessedit_bigram_debug) {
437  tprintf("Skipping because one of the words is W_REP_CHAR\n");
438  }
439  continue;
440  }
441  // Two words sharing the same language model, excellent!
442  if (w->alt_choices.empty()) {
443  if (tessedit_bigram_debug) {
444  tprintf("Alt choices not set up for word choice: %s\n",
446  }
447  continue;
448  }
449  if (w_prev->alt_choices.empty()) {
450  if (tessedit_bigram_debug) {
451  tprintf("Alt choices not set up for word choice: %s\n",
452  w_prev->best_choice->unichar_string().string());
453  }
454  continue;
455  }
456 
457  // We saved alternate choices, excellent!
458  GenericVector<WERD_CHOICE *> overrides_word1;
459  GenericVector<GenericVector<int> *> overrides_word1_state;
460  GenericVector<WERD_CHOICE *> overrides_word2;
461  GenericVector<GenericVector<int> *> overrides_word2_state;
462 
463  STRING orig_w1_str = w_prev->best_choice->unichar_string();
464  STRING orig_w2_str = w->best_choice->unichar_string();
465  WERD_CHOICE prev_best(w->uch_set);
466  {
467  int w1start, w1end;
468  w_prev->WithoutFootnoteSpan(&w1start, &w1end);
469  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
470  }
471  WERD_CHOICE this_best(w->uch_set);
472  {
473  int w2start, w2end;
474  w->WithoutFootnoteSpan(&w2start, &w2end);
475  this_best = w->best_choice->shallow_copy(w2start, w2end);
476  }
477 
478  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
479  if (tessedit_bigram_debug) {
480  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
481  orig_w1_str.string(), orig_w2_str.string());
482  }
483  continue;
484  }
485  if (tessedit_bigram_debug > 2) {
486  tprintf("Examining alt choices for \"%s %s\".\n",
487  orig_w1_str.string(), orig_w2_str.string());
488  }
489  if (tessedit_bigram_debug > 1) {
490  if (w_prev->alt_choices.size() > 1) {
492  }
493  if (w->alt_choices.size() > 1) {
495  }
496  }
497  float best_rating = 0.0;
498  int best_idx = 0;
499  for (int i = 0; i < w_prev->alt_choices.size(); i++) {
500  WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
501  WERD_CHOICE strip1(w->uch_set);
502  {
503  int p1start, p1end;
504  w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
505  &p1start, &p1end);
506  strip1 = p1->shallow_copy(p1start, p1end);
507  }
508  for (int j = 0; j < w->alt_choices.size(); j++) {
509  WERD_CHOICE *p2 = w->alt_choices.get(j);
510  WERD_CHOICE strip2(w->uch_set);
511  {
512  int p2start, p2end;
513  w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
514  strip2 = p2->shallow_copy(p2start, p2end);
515  }
516  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
517  overrides_word1.push_back(p1);
518  overrides_word1_state.push_back(&w_prev->alt_states.get(i));
519  overrides_word2.push_back(p2);
520  overrides_word2_state.push_back(&w->alt_states.get(j));
521  if (overrides_word1.size() == 1 ||
522  p1->rating() + p2->rating() < best_rating) {
523  best_rating = p1->rating() + p2->rating();
524  best_idx = overrides_word1.size() - 1;
525  }
526  }
527  }
528  }
529  if (overrides_word1.size() >= 1) {
530  // Excellent, we have some bigram matches.
532  *overrides_word1[best_idx]) &&
534  *overrides_word2[best_idx])) {
535  if (tessedit_bigram_debug > 1) {
536  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
537  "model.\n", orig_w1_str.string(), orig_w2_str.string());
538  }
539  continue;
540  }
541  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
542  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
543  if (new_w1_str != orig_w1_str) {
544  w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
545  *overrides_word1_state[best_idx]);
546  }
547  if (new_w2_str != orig_w2_str) {
548  w->ReplaceBestChoice(*overrides_word2[best_idx],
549  *overrides_word2_state[best_idx]);
550  }
551  if (tessedit_bigram_debug > 0) {
552  STRING choices_description;
553  int num_bigram_choices
554  = overrides_word1.size() * overrides_word2.size();
555  if (num_bigram_choices == 1) {
556  choices_description = "This was the unique bigram choice.";
557  } else {
558  if (tessedit_bigram_debug > 1) {
559  STRING bigrams_list;
560  const int kMaxChoicesToPrint = 20;
561  for (int i = 0; i < overrides_word1.size() &&
562  i < kMaxChoicesToPrint; i++) {
563  if (i > 0) { bigrams_list += ", "; }
564  WERD_CHOICE *p1 = overrides_word1[i];
565  WERD_CHOICE *p2 = overrides_word2[i];
566  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
567  if (i == kMaxChoicesToPrint) {
568  bigrams_list += " ...";
569  }
570  }
571  choices_description = "There were many choices: {";
572  choices_description += bigrams_list;
573  choices_description += "}";
574  } else {
575  choices_description.add_str_int("There were ", num_bigram_choices);
576  choices_description += " compatible bigrams.";
577  }
578  }
579  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
580  orig_w1_str.string(), orig_w2_str.string(),
581  new_w1_str.string(), new_w2_str.string(),
582  choices_description.string());
583  }
584  }
585  }
586 }
587 
589  ETEXT_DESC* monitor,
590  const TBOX* target_word_box,
591  const char* word_config) {
592  PAGE_RES_IT page_res_it(page_res);
593  // ****************** Pass 5 *******************
594  // Gather statistics on rejects.
595  int word_index = 0;
596  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
598  WERD_RES* word = page_res_it.word();
599  word_index++;
600  if (monitor != NULL) {
601  monitor->ocr_alive = TRUE;
602  monitor->progress = 95 + 5 * word_index / stats_.word_count;
603  }
604  if (word->rebuild_word == NULL) {
605  // Word was not processed by tesseract.
606  page_res_it.forward();
607  continue;
608  }
609  check_debug_pt(word, 70);
610 
611  // changed by jetsoft
612  // specific to its needs to extract one word when need
613  if (target_word_box &&
615  *target_word_box, word_config, 4)) {
616  page_res_it.forward();
617  continue;
618  }
619  // end jetsoft
620 
621  page_res_it.rej_stat_word();
622  int chars_in_word = word->reject_map.length();
623  int rejects_in_word = word->reject_map.reject_count();
624 
625  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
626  stats_.doc_blob_quality += blob_quality;
627  int outline_errs = word_outline_errs(word);
628  stats_.doc_outline_errs += outline_errs;
629  inT16 all_char_quality;
630  inT16 accepted_all_char_quality;
631  word_char_quality(word, page_res_it.row()->row,
632  &all_char_quality, &accepted_all_char_quality);
633  stats_.doc_char_quality += all_char_quality;
634  uinT8 permuter_type = word->best_choice->permuter();
635  if ((permuter_type == SYSTEM_DAWG_PERM) ||
636  (permuter_type == FREQ_DAWG_PERM) ||
637  (permuter_type == USER_DAWG_PERM)) {
638  stats_.good_char_count += chars_in_word - rejects_in_word;
639  stats_.doc_good_char_quality += accepted_all_char_quality;
640  }
641  check_debug_pt(word, 80);
643  (blob_quality == 0) && (outline_errs >= chars_in_word))
645  check_debug_pt(word, 90);
646  page_res_it.forward();
647  }
648 
650  tprintf
651  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
652  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
653  page_res->char_count, page_res->rej_count,
654  page_res->rej_count / static_cast<float>(page_res->char_count),
655  stats_.doc_blob_quality,
656  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
657  stats_.doc_outline_errs,
658  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
659  stats_.doc_char_quality,
660  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
661  stats_.doc_good_char_quality,
662  (stats_.good_char_count > 0) ?
663  (stats_.doc_good_char_quality /
664  static_cast<float>(stats_.good_char_count)) : 0.0);
665  }
666  BOOL8 good_quality_doc =
667  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
668  quality_rej_pc) &&
669  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
670  quality_blob_pc) &&
671  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
673  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
675 
676  // ****************** Pass 6 *******************
677  // Do whole document or whole block rejection pass
678  if (!tessedit_test_adaption) {
680  quality_based_rejection(page_res_it, good_quality_doc);
681  }
682 }
683 
685  if (!wordrec_run_blamer) return;
686  PAGE_RES_IT page_res_it(page_res);
687  for (page_res_it.restart_page(); page_res_it.word() != NULL;
688  page_res_it.forward()) {
689  WERD_RES *word = page_res_it.word();
690  if (word->blamer_bundle == NULL) {
691  word->blamer_bundle = new BlamerBundle();
694  word->blamer_bundle->debug += " to blame";
695  } else if (word->blamer_bundle->incorrect_result_reason ==
696  IRR_NO_TRUTH) {
697  word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
699  } else {
700  bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
701  word->blamer_bundle->truth_text);
704  if (irr == IRR_CORRECT && !correct) {
705  STRING debug = "Choice is incorrect after recognition";
706  word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
707  word->best_choice,
709  } else if (irr != IRR_CORRECT && correct) {
710  if (wordrec_debug_blamer) {
711  tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
712  }
714  word->blamer_bundle->debug = "";
715  }
716  }
718  }
719  tprintf("Blame reasons:\n");
720  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
722  static_cast<IncorrectResultReason>(bl)),
723  page_res->blame_reasons[bl]);
724  }
725  if (page_res->misadaption_log.length() > 0) {
726  tprintf("Misadaption log:\n");
727  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
728  tprintf("%s\n", page_res->misadaption_log[i].string());
729  }
730  }
731 }
732 
733 // Helper returns true if the new_word is better than the word, using a
734 // simple test of better certainty AND rating (to reduce false positives
735 // from cube) or a dictionary vs non-dictionary word.
736 static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
737  if (new_word.best_choice == NULL) {
738  return false; // New one no good.
739  }
740  if (word.best_choice == NULL) {
741  return true; // Old one no good.
742  }
743  if (new_word.best_choice->certainty() > word.best_choice->certainty() &&
744  new_word.best_choice->rating() < word.best_choice->rating()) {
745  return true; // New word has better confidence.
746  }
747  if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
748  Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) {
749  return true; // New word is from a dictionary.
750  }
751  return false; // New word is no better.
752 }
753 
754 // Helper to recognize the word using the given (language-specific) tesseract.
755 // Returns true if the result was better than previously.
757  WordRecognizer recognizer) {
759  tprintf("Retrying word using lang %s, oem %d\n",
760  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
761  }
762  // Setup a trial WERD_RES in which to classify.
763  WERD_RES lang_word;
764  lang_word.InitForRetryRecognition(*word);
765  // Run the recognizer on the word.
766  // Initial version is a bit of a hack based on better certainty and rating
767  // (to reduce false positives from cube) or a dictionary vs non-dictionary
768  // word.
769  (this->*recognizer)(block, row, &lang_word);
770  bool new_is_better = NewWordBetter(*word, lang_word);
772  if (lang_word.best_choice == NULL) {
773  tprintf("New result %s better:%s\n",
774  new_is_better ? "IS" : "NOT");
775  } else {
776  tprintf("New result %s better:%s, r=%g, c=%g\n",
777  new_is_better ? "IS" : "NOT",
778  lang_word.best_choice->unichar_string().string(),
779  lang_word.best_choice->rating(),
780  lang_word.best_choice->certainty());
781  }
782  }
783  if (new_is_better) {
784  word->ConsumeWordResults(&lang_word);
785  }
786  return new_is_better;
787 }
788 
789 // Generic function for classifying a word. Can be used either for pass1 or
790 // pass2 according to the function passed to recognizer.
791 // word block and row are the current location in the document's PAGE_RES.
792 // Recognizes in the current language, and if successful that is all.
793 // If recognition was not successful, tries all available languages until
794 // it gets a successful result or runs out of languages. Keeps the best result.
796  BLOCK* block,
797  ROW *row,
798  WERD_RES *word) {
800  tprintf("Processing word with lang %s at:",
801  most_recently_used_->lang.string());
802  word->word->bounding_box().print();
803  }
804  const char* result_type = "Initial";
805  bool initially_done = !word->tess_failed && word->done;
806  if (initially_done) {
807  // If done on pass1, we reuse the tesseract that did it, and don't try
808  // any more. The only need to call the classifier at all is for the
809  // cube combiner and xheight fixing (which may be bogus on a done word.)
810  most_recently_used_ = word->tesseract;
811  result_type = "Already done";
812  }
813  (most_recently_used_->*recognizer)(block, row, word);
814  if (!word->tess_failed && word->tess_accepted)
815  result_type = "Accepted";
817  tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
818  result_type,
819  word->best_choice->unichar_string().string(),
820  word->best_choice->rating(),
821  word->best_choice->certainty(),
822  word->tess_accepted, word->tess_would_adapt);
823  }
824  if (word->tess_failed || !word->tess_accepted) {
825  // Try all the other languages to see if they are any better.
826  Tesseract* previous_used = most_recently_used_;
827  if (most_recently_used_ != this) {
828  if (classify_debug_level) {
829  tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
830  }
831  if (RetryWithLanguage(word, block, row, recognizer)) {
832  most_recently_used_ = this;
833  if (!word->tess_failed && word->tess_accepted)
834  return; // No need to look at the others.
835  }
836  }
837 
838  for (int i = 0; i < sub_langs_.size(); ++i) {
839  if (sub_langs_[i] != previous_used) {
840  if (classify_debug_level) {
841  tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
842  i, sub_langs_[i]->lang.string());
843  }
844  if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) {
845  most_recently_used_ = sub_langs_[i];
846  if (!word->tess_failed && word->tess_accepted)
847  return; // No need to look at the others.
848  }
849  }
850  }
851  }
852 }
853 
861  // If we only intend to run cube - run it and return.
863  cube_word_pass1(block, row, word);
864  return;
865  }
866 
867  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
868  BOOL8 adapt_ok;
869  const char *rejmap;
870  inT16 index;
871  STRING mapstr = "";
872 
873  check_debug_pt(word, 0);
874  if (word->SetupForTessRecognition(unicharset, this, BestPix(),
877  row, block))
878  tess_segment_pass1(word, blob_choices);
879  if (!word->tess_failed) {
880  /*
881  The adaption step used to be here. It has been moved to after
882  make_reject_map so that we know whether the word will be accepted in the
883  first pass or not. This move will PREVENT adaption to words containing
884  double quotes because the word will not be identical to what tess thinks
885  its best choice is. (See CurrentBestChoiceIs in
886  stopper.cpp which is used by AdaptableWord in
887  adaptmatch.cpp)
888  */
889 
890  if (!word->word->flag(W_REP_CHAR)) {
891  // TODO(daria) delete these hacks when replaced by more generic code.
892  // Convert '' (double single) to " (single double).
893  word->fix_quotes(blob_choices);
894  if (tessedit_fix_hyphens) // turn -- to -
895  word->fix_hyphens(blob_choices);
896 
898  word->raw_choice);
899 
900  word->tess_would_adapt = word->best_choice && word->raw_choice &&
902  *word->best_choice,
903  *word->raw_choice);
904  // Also sets word->done flag
905  make_reject_map(word, blob_choices, row, 1);
906 
908 
909  if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
911  rejmap = NULL;
912  } else {
913  ASSERT_HOST(word->reject_map.length() ==
914  word->best_choice->length());
915 
916  for (index = 0; index < word->reject_map.length(); index++) {
917  if (adapt_ok || word->reject_map[index].accepted())
918  mapstr += '1';
919  else
920  mapstr += '0';
921  }
922  rejmap = mapstr.string();
923  }
924  // Send word to adaptive classifier for training.
925  word->BestChoiceToCorrectText();
926  set_word_fonts(word, blob_choices);
927  LearnWord(NULL, rejmap, word);
928  // Mark misadaptions if running blamer.
929  if (word->blamer_bundle != NULL &&
931  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
932  word->blamer_bundle->truth_text)) {
933  word->blamer_bundle->misadaption_debug ="misadapt to word (";
935  word->best_choice->permuter_name();
936  word->blamer_bundle->misadaption_debug += "): ";
938  "", word->best_choice, &(word->blamer_bundle->misadaption_debug));
939  if (wordrec_debug_blamer) {
940  tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
941  }
942  }
943  }
944 
947  }
948  }
949 
950  // Save best choices in the WERD_CHOICE if needed
951  word->best_choice->set_blob_choices(blob_choices);
952 }
953 
954 // Helper to report the result of the xheight fix.
955 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
956  WERD_RES* word, WERD_RES* new_word) {
957  tprintf("New XHT Match:%s = %s ",
958  word->best_choice->unichar_string().string(),
959  word->best_choice->debug_string().string());
960  word->reject_map.print(debug_fp);
961  tprintf(" -> %s = %s ",
962  new_word->best_choice->unichar_string().string(),
963  new_word->best_choice->debug_string().string());
964  new_word->reject_map.print(debug_fp);
965  tprintf(" %s->%s %s %s\n",
966  word->guessed_x_ht ? "GUESS" : "CERT",
967  new_word->guessed_x_ht ? "GUESS" : "CERT",
968  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
969  accept_new_word ? "ACCEPTED" : "");
970 }
971 
972 // Run the x-height fix-up, based on min/max top/bottom information in
973 // unicharset.
974 // Returns true if the word was changed.
975 // See the comment in fixxht.cpp for a description of the overall process.
976 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
977  bool accept_new_x_ht = false;
978  int original_misfits = CountMisfitTops(word);
979  if (original_misfits == 0)
980  return false;
981  float new_x_ht = ComputeCompatibleXheight(word);
982  if (new_x_ht > 0.0f) {
983  WERD_RES new_x_ht_word(word->word);
984  if (word->blamer_bundle != NULL) {
985  new_x_ht_word.blamer_bundle = new BlamerBundle();
986  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
987  }
988  new_x_ht_word.x_height = new_x_ht;
989  new_x_ht_word.caps_height = 0.0;
990  match_word_pass2(&new_x_ht_word, row, block);
991  if (!new_x_ht_word.tess_failed) {
992  int new_misfits = CountMisfitTops(&new_x_ht_word);
993  if (debug_x_ht_level >= 1) {
994  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
995  original_misfits, word->x_height,
996  new_misfits, new_x_ht);
997  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
998  word->best_choice->rating(), word->best_choice->certainty(),
999  new_x_ht_word.best_choice->rating(),
1000  new_x_ht_word.best_choice->certainty());
1001  }
1002  // The misfits must improve and either the rating or certainty.
1003  accept_new_x_ht = new_misfits < original_misfits &&
1004  (new_x_ht_word.best_choice->certainty() >
1005  word->best_choice->certainty() ||
1006  new_x_ht_word.best_choice->rating() <
1007  word->best_choice->rating());
1008  if (debug_x_ht_level >= 1) {
1009  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1010  }
1011  }
1012  if (accept_new_x_ht) {
1013  word->ConsumeWordResults(&new_x_ht_word);
1014  return true;
1015  }
1016  }
1017  return false;
1018 }
1019 
1027  // Return if we do not want to run Tesseract.
1030  return;
1031 
1032  bool done_this_pass = false;
1034  check_debug_pt(word, 30);
1035  if (!word->done || tessedit_training_tess) {
1036  word->caps_height = 0.0;
1037  if (word->x_height == 0.0f)
1038  word->x_height = row->x_height();
1039  match_word_pass2(word, row, block);
1040  done_this_pass = TRUE;
1041  check_debug_pt(word, 40);
1042  }
1043 
1044  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1045  bool accept_new_xht = false;
1047  // Use the tops and bottoms since they are available.
1048  accept_new_xht = TrainedXheightFix(word, block, row);
1049  }
1050  if (accept_new_xht)
1051  done_this_pass = true;
1052  // Test for small caps. Word capheight must be close to block xheight,
1053  // and word must contain no lower case letters, and at least one upper case.
1054  double small_cap_xheight = block->x_height() * kXHeightCapRatio;
1055  double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0;
1057  small_cap_xheight - small_cap_delta <= word->x_height &&
1058  word->x_height <= small_cap_xheight + small_cap_delta) {
1059  // Scan for upper/lower.
1060  int num_upper = 0;
1061  int num_lower = 0;
1062  for (int i = 0; i < word->best_choice->length(); ++i) {
1064  ++num_upper;
1065  else if (unicharset.get_islower(word->best_choice->unichar_id(i)))
1066  ++num_lower;
1067  }
1068  if (num_upper > 0 && num_lower == 0)
1069  word->small_caps = true;
1070  }
1071  word->SetScriptPositions();
1072 
1074  }
1075 #ifndef GRAPHICS_DISABLED
1077  if (fx_win == NULL)
1078  create_fx_win();
1079  clear_fx_win();
1080  word->rebuild_word->plot(fx_win);
1081  TBOX wbox = word->rebuild_word->bounding_box();
1082  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1083  wbox.right(), wbox.bottom());
1085  }
1086 #endif
1088  check_debug_pt(word, 50);
1089 }
1090 
1091 
1098 void Tesseract::match_word_pass2(WERD_RES *word, //word to do
1099  ROW *row,
1100  BLOCK* block) {
1101  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
1102 
1103  if (word->SetupForTessRecognition(unicharset, this, BestPix(),
1106  row, block))
1107  tess_segment_pass2(word, blob_choices);
1108 
1109  if (!word->tess_failed) {
1110  if (!word->word->flag (W_REP_CHAR)) {
1111  word->fix_quotes(blob_choices);
1113  word->fix_hyphens(blob_choices);
1114  /* Dont trust fix_quotes! - though I think I've fixed the bug */
1115  if (word->best_choice->length() != word->box_word->length() ||
1116  word->best_choice->length() != blob_choices->length()) {
1117  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1118  " #Blobs=%d; #Choices=%d\n",
1119  word->best_choice->debug_string().string(),
1120  word->best_choice->length(),
1121  word->box_word->length(), blob_choices->length());
1122 
1123  }
1125  word->raw_choice);
1126 
1127  make_reject_map (word, blob_choices, row, 2);
1128  }
1129  }
1130 
1131  // Save best choices in the WERD_CHOICE if needed
1132  word->best_choice->set_blob_choices(blob_choices);
1133  set_word_fonts(word, blob_choices);
1134 
1135  assert (word->raw_choice != NULL);
1136 }
1137 
1138 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
1139 // unichar_id, or NULL if there is no match.
1140 static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
1141  BLOB_CHOICE_LIST* bc_list) {
1142  // Find the corresponding best BLOB_CHOICE.
1143  BLOB_CHOICE_IT choice_it(bc_list);
1144  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1145  choice_it.forward()) {
1146  BLOB_CHOICE* choice = choice_it.data();
1147  if (choice->unichar_id() == char_id) {
1148  return choice;
1149  }
1150  }
1151  return NULL;
1152 }
1153 
1154 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1155 // the given char_id, or NULL if none can be found.
1156 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1157  WERD_RES* word_res) {
1158  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1159  BLOB_CHOICE* best_choice = NULL;
1160  BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices());
1161  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1162  BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
1163  if (choice != NULL) {
1164  if (best_choice == NULL || choice->rating() < best_choice->rating())
1165  best_choice = choice;
1166  }
1167  }
1168  return best_choice;
1169 }
1170 
1171 // Helper to insert blob_choice in each location in the leader word if there is
1172 // no matching BLOB_CHOICE there already, and correct any incorrect results
1173 // in the best_choice.
1174 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1175  WERD_RES* word_res) {
1176  WERD_CHOICE* word = word_res->best_choice;
1177  BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices());
1178  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1179  BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1180  bc_it.data());
1181  if (choice == NULL) {
1182  BLOB_CHOICE_IT choice_it(bc_it.data());
1183  choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1184  }
1185  }
1186  // Correct any incorrect results in word.
1187  for (int i = 0; i < word->length(); ++i) {
1188  if (word->unichar_id(i) != blob_choice->unichar_id())
1189  word->set_unichar_id(blob_choice->unichar_id(), i);
1190  }
1191 }
1192 
1201  WERD_RES *word_res = page_res_it->word();
1202  const WERD_CHOICE &word = *(word_res->best_choice);
1203 
1204  // Find the frequency of each unique character in the word.
1205  UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
1206  SortHelper<UNICHAR_ID> rep_ch(word.length());
1207  for (int i = 0; i < word.length(); ++i) {
1208  if (word.unichar_id(i) != space)
1209  rep_ch.Add(word.unichar_id(i), 1);
1210  }
1211 
1212  // Find the most frequent result.
1213  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1214  int max_count = rep_ch.MaxCount(&maxch_id);
1215  // Find the best exemplar of a classifier result for maxch_id.
1216  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1217  if (best_choice == NULL) {
1218  tprintf("Failed to find a choice for %s, occurring %d times\n",
1219  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1220  return;
1221  }
1222  word_res->done = TRUE;
1223 
1224  // Measure the mean space.
1225  int total_gap = 0;
1226  int gap_count = 0;
1227  WERD* werd = word_res->word;
1228  C_BLOB_IT blob_it(werd->cblob_list());
1229  C_BLOB* prev_blob = blob_it.data();
1230  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1231  C_BLOB* blob = blob_it.data();
1232  int gap = blob->bounding_box().left();
1233  gap -= prev_blob->bounding_box().right();
1234  total_gap += gap;
1235  ++gap_count;
1236  prev_blob = blob;
1237  }
1238  if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
1239  // Needs spaces between.
1240  ExplodeRepeatedWord(best_choice, page_res_it);
1241  } else {
1242  // Just correct existing classification.
1243  CorrectRepcharChoices(best_choice, word_res);
1244  word_res->reject_map.initialise(word.length());
1245  }
1246 }
1247 
1248 // Explode the word at the given iterator location into individual words
1249 // of a single given unichar_id defined by best_choice.
1250 // The original word is deleted, and the replacements copy most of their
1251 // fields from the original.
1253  PAGE_RES_IT* page_res_it) {
1254  WERD_RES *word_res = page_res_it->word();
1255  ASSERT_HOST(best_choice != NULL);
1256 
1257  // Make a new word for each blob in the original.
1258  WERD* werd = word_res->word;
1259  C_BLOB_IT blob_it(werd->cblob_list());
1260  for (; !blob_it.empty(); blob_it.forward()) {
1261  bool first_blob = blob_it.at_first();
1262  bool last_blob = blob_it.at_last();
1263  WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
1264  blob_it.extract());
1265  // Note that blamer_bundle (truth information) is not copied, which is
1266  // desirable, since the newly inserted words would not have the original
1267  // bounding box corresponding to the one recorded in truth fields.
1268  WERD_RES* rep_word =
1269  page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
1270  // Setup the single char WERD_RES
1271  if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
1272  false,
1274  page_res_it->row()->row,
1275  page_res_it->block()->block)) {
1276  rep_word->CloneChoppedToRebuild();
1277  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
1278  rep_word->FakeClassifyWord(1, &blob_choice);
1279  }
1280  }
1281  page_res_it->DeleteCurrentWord();
1282 }
1283 
1285  const UNICHARSET& char_set, const char *s, const char *lengths) {
1286  int i = 0;
1287  int offset = 0;
1288  int leading_punct_count;
1289  int upper_count = 0;
1290  int hyphen_pos = -1;
1292 
1293  if (strlen (lengths) > 20)
1294  return word_type;
1295 
1296  /* Single Leading punctuation char*/
1297 
1298  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1299  offset += lengths[i++];
1300  leading_punct_count = i;
1301 
1302  /* Initial cap */
1303  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1304  offset += lengths[i++];
1305  upper_count++;
1306  }
1307  if (upper_count > 1) {
1308  word_type = AC_UPPER_CASE;
1309  } else {
1310  /* Lower case word, possibly with an initial cap */
1311  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1312  offset += lengths[i++];
1313  }
1314  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1315  goto not_a_word;
1316  /*
1317  Allow a single hyphen in a lower case word
1318  - dont trust upper case - I've seen several cases of "H" -> "I-I"
1319  */
1320  if (lengths[i] == 1 && s[offset] == '-') {
1321  hyphen_pos = i;
1322  offset += lengths[i++];
1323  if (s[offset] != '\0') {
1324  while ((s[offset] != '\0') &&
1325  char_set.get_islower(s + offset, lengths[i])) {
1326  offset += lengths[i++];
1327  }
1328  if (i < hyphen_pos + 3)
1329  goto not_a_word;
1330  }
1331  } else {
1332  /* Allow "'s" in NON hyphenated lower case words */
1333  if (lengths[i] == 1 && (s[offset] == '\'') &&
1334  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1335  offset += lengths[i++];
1336  offset += lengths[i++];
1337  }
1338  }
1339  if (upper_count > 0)
1340  word_type = AC_INITIAL_CAP;
1341  else
1342  word_type = AC_LOWER_CASE;
1343  }
1344 
1345  /* Up to two different, constrained trailing punctuation chars */
1346  if (lengths[i] == 1 && s[offset] != '\0' &&
1347  STRING(chs_trailing_punct1).contains(s[offset]))
1348  offset += lengths[i++];
1349  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1350  s[offset - lengths[i - 1]] != s[offset] &&
1351  STRING(chs_trailing_punct2).contains (s[offset]))
1352  offset += lengths[i++];
1353 
1354  if (s[offset] != '\0')
1355  word_type = AC_UNACCEPTABLE;
1356 
1357  not_a_word:
1358 
1359  if (word_type == AC_UNACCEPTABLE) {
1360  /* Look for abbreviation string */
1361  i = 0;
1362  offset = 0;
1363  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1364  word_type = AC_UC_ABBREV;
1365  while (s[offset] != '\0' &&
1366  char_set.get_isupper(s + offset, lengths[i]) &&
1367  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1368  offset += lengths[i++];
1369  offset += lengths[i++];
1370  }
1371  }
1372  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1373  word_type = AC_LC_ABBREV;
1374  while (s[offset] != '\0' &&
1375  char_set.get_islower(s + offset, lengths[i]) &&
1376  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1377  offset += lengths[i++];
1378  offset += lengths[i++];
1379  }
1380  }
1381  if (s[offset] != '\0')
1382  word_type = AC_UNACCEPTABLE;
1383  }
1384 
1385  return word_type;
1386 }
1387 
1389  BOOL8 show_map_detail = FALSE;
1390  inT16 i;
1391 
1392  #ifndef SECURE_NAMES
1393  if (!test_pt)
1394  return FALSE;
1395 
1396  tessedit_rejection_debug.set_value (FALSE);
1397  debug_x_ht_level.set_value (0);
1398 
1399  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1400  if (location < 0)
1401  return TRUE; // For breakpoint use
1402  tessedit_rejection_debug.set_value (TRUE);
1403  debug_x_ht_level.set_value (20);
1404  tprintf ("\n\nTESTWD::");
1405  switch (location) {
1406  case 0:
1407  tprintf ("classify_word_pass1 start\n");
1408  word->word->print();
1409  break;
1410  case 10:
1411  tprintf ("make_reject_map: initial map");
1412  break;
1413  case 20:
1414  tprintf ("make_reject_map: after NN");
1415  break;
1416  case 30:
1417  tprintf ("classify_word_pass2 - START");
1418  break;
1419  case 40:
1420  tprintf ("classify_word_pass2 - Pre Xht");
1421  break;
1422  case 50:
1423  tprintf ("classify_word_pass2 - END");
1424  show_map_detail = TRUE;
1425  break;
1426  case 60:
1427  tprintf ("fixspace");
1428  break;
1429  case 70:
1430  tprintf ("MM pass START");
1431  break;
1432  case 80:
1433  tprintf ("MM pass END");
1434  break;
1435  case 90:
1436  tprintf ("After Poor quality rejection");
1437  break;
1438  case 100:
1439  tprintf ("unrej_good_quality_words - START");
1440  break;
1441  case 110:
1442  tprintf ("unrej_good_quality_words - END");
1443  break;
1444  case 120:
1445  tprintf ("Write results pass");
1446  show_map_detail = TRUE;
1447  break;
1448  }
1449  tprintf(" \"%s\" ",
1450  word->best_choice->unichar_string().string());
1451  word->reject_map.print (debug_fp);
1452  tprintf ("\n");
1453  if (show_map_detail) {
1454  tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
1455  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1456  tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1457  word->reject_map[i].full_print(debug_fp);
1458  }
1459  }
1460  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1461  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1462  return TRUE;
1463  }
1464  else
1465  #endif
1466  return FALSE;
1467 }
1468 
1474 static void find_modal_font( //good chars in word
1475  STATS *fonts, //font stats
1476  inT16 *font_out, //output font
1477  inT8 *font_count //output count
1478  ) {
1479  inT16 font; //font index
1480  inT32 count; //pile couat
1481 
1482  if (fonts->get_total () > 0) {
1483  font = (inT16) fonts->mode ();
1484  *font_out = font;
1485  count = fonts->pile_count (font);
1486  *font_count = count < MAX_INT8 ? count : MAX_INT8;
1487  fonts->add (font, -*font_count);
1488  }
1489  else {
1490  *font_out = -1;
1491  *font_count = 0;
1492  }
1493 }
1494 
1501  BLOB_CHOICE_LIST_CLIST *blob_choices) {
1502  if (blob_choices == NULL) return;
1503  // Don't try to set the word fonts for a cube word, as the configs
1504  // will be meaningless.
1505  if (word->chopped_word == NULL) return;
1506 
1507  inT32 index; // char id index
1508  // character iterator
1509  BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1510  BLOB_CHOICE_IT choice_it; // choice iterator
1511  int fontinfo_size = get_fontinfo_table().size();
1512  int fontset_size = get_fontset_table().size();
1513  if (fontinfo_size == 0 || fontset_size == 0) return;
1514  STATS fonts(0, fontinfo_size); // font counters
1515 
1516  word->italic = 0;
1517  word->bold = 0;
1518  if (!word->best_choice_fontinfo_ids.empty()) {
1520  }
1521  // Compute the modal font for the word
1522  for (char_it.mark_cycle_pt(), index = 0;
1523  !char_it.cycled_list(); ++index, char_it.forward()) {
1524  UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
1525  choice_it.set_to_list(char_it.data());
1526  if (tessedit_debug_fonts) {
1527  tprintf("Examining fonts in %s\n",
1528  word->best_choice->debug_string().string());
1529  }
1530  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1531  choice_it.forward()) {
1532  UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
1533  if (blob_ch_id == word_ch_id) {
1534  if (tessedit_debug_fonts) {
1535  tprintf("%s font %s (%d) font2 %s (%d)\n",
1536  word->uch_set->id_to_unichar(blob_ch_id),
1537  choice_it.data()->fontinfo_id() < 0 ? "unknown" :
1538  fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
1539  choice_it.data()->fontinfo_id(),
1540  choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
1541  fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
1542  choice_it.data()->fontinfo_id2());
1543  }
1544  // 1st choice font gets 2 pts, 2nd choice 1 pt.
1545  if (choice_it.data()->fontinfo_id() >= 0) {
1546  fonts.add(choice_it.data()->fontinfo_id(), 2);
1547  }
1548  if (choice_it.data()->fontinfo_id2() >= 0) {
1549  fonts.add(choice_it.data()->fontinfo_id2(), 1);
1550  }
1551  break;
1552  }
1553  }
1554  }
1555  inT16 font_id1, font_id2;
1556  find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
1557  find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
1558  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1559  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1560  // All the blobs get the word's best choice font.
1561  for (int i = 0; i < word->best_choice->length(); ++i) {
1562  word->best_choice_fontinfo_ids.push_back(font_id1);
1563  }
1564  if (word->fontinfo_id_count > 0) {
1565  FontInfo fi = fontinfo_table_.get(font_id1);
1566  if (tessedit_debug_fonts) {
1567  if (word->fontinfo_id2_count > 0) {
1568  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1569  fi.name, word->fontinfo_id_count,
1570  fontinfo_table_.get(font_id2).name,
1571  word->fontinfo_id2_count);
1572  } else {
1573  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1574  fi.name, word->fontinfo_id_count);
1575  }
1576  }
1577  // 1st choices got 2 pts, so we need to halve the score for the mode.
1578  word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
1579  word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
1580  }
1581 }
1582 
1583 
1591  PAGE_RES_IT page_res_it(page_res);
1592  WERD_RES *word; // current word
1593  STATS doc_fonts(0, font_table_size_); // font counters
1594 
1595  // Gather font id statistics.
1596  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1597  page_res_it.forward()) {
1598  word = page_res_it.word();
1599  if (word->fontinfo != NULL) {
1600  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
1601  }
1602  if (word->fontinfo2 != NULL) {
1603  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1604  }
1605  }
1606  inT16 doc_font; // modal font
1607  inT8 doc_font_count; // modal font
1608  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1609  if (doc_font_count == 0)
1610  return;
1611  // Get the modal font pointer.
1612  const FontInfo* modal_font = NULL;
1613  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1614  page_res_it.forward()) {
1615  word = page_res_it.word();
1616  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
1617  modal_font = word->fontinfo;
1618  break;
1619  }
1620  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
1621  modal_font = word->fontinfo2;
1622  break;
1623  }
1624  }
1625  ASSERT_HOST(modal_font != NULL);
1626 
1627  // Assign modal font to weak words.
1628  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1629  page_res_it.forward()) {
1630  word = page_res_it.word();
1631  int length = word->best_choice->length();
1632 
1633  // 1st choices got 2 pts, so we need to halve the score for the mode.
1634  int count = (word->fontinfo_id_count + 1) / 2;
1635  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1636  word->fontinfo = modal_font;
1637  // Counts only get 1 as it came from the doc.
1638  word->fontinfo_id_count = 1;
1639  word->italic = modal_font->is_italic() ? 1 : -1;
1640  word->bold = modal_font->is_bold() ? 1 : -1;
1641  }
1642  }
1643 }
1644 
1645 } // namespace tesseract