///////////////////////////////////////////////////////////////////////
// File:        language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help recognition.
// Author:      Daria Antonova
// Created:     Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include <math.h>

#include "language_model.h"

#include "dawg.h"
#include "intproto.h"
#include "matrix.h"
#include "params.h"
#include "params_training_featdef.h"

namespace tesseract {

ELISTIZE(ViterbiStateEntry);

const float LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f;
const float LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f;
const float LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f;
const float LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f;
const float LanguageModel::kMaxAvgNgramCost = 25.0f;
const int LanguageModel::kMinFixedLengthDawgLength = 2;
const float LanguageModel::kLooseMaxCharWhRatio = 2.5f;

LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
                             Dict *dict)
  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
               dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_ngram_on, false,
                     "Turn on/off the use of character ngram model",
                     dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_ngram_order, 8,
               "Maximum order of the character ngram model",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
               "Maximum number of prunable (those for which"
               " PrunablePath() is true) entries in each viterbi list"
               " recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_size, 500,
               "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_small_prob, 0.000001,
                  "To avoid overly small denominators use this as the "
                  "floor of the probability returned by the ngram model.",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                  "Average classifier score of a non-matching unichar.",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                "Use only the first UTF8 step of the given string"
                " when computing log probabilities.",
                dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_scale_factor, 0.03,
                  "Strength of the character ngram model relative to the"
                  " character classifier",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                "Words are delimited by space",
                dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_min_compound_length, 3,
               "Minimum length of compound words",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_fixed_length_choices_depth, 3,
               "Depth of blob choice lists to explore"
               " when fixed length dawgs are on",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                  "Penalty for words not in the frequent word dictionary",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                  "Penalty for non-dictionary words",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_punc, 0.2,
                  "Penalty for inconsistent punctuation",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_case, 0.1,
                  "Penalty for inconsistent case",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_script, 0.5,
                  "Penalty for inconsistent script",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_chartype, 0.3,
                  "Penalty for inconsistent character type",
                  dict->getImage()->getCCUtil()->params()),
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    double_MEMBER(language_model_penalty_font, 0.00,
                  "Penalty for inconsistent font",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_spacing, 0.05,
                  "Penalty for inconsistent spacing",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_increment, 0.01,
                  "Penalty increment",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                     "Use sigmoidal score for certainty",
                     dict->getImage()->getCCUtil()->params()),
    fontinfo_table_(fontinfo_table), dict_(dict),
    fixed_pitch_(false), max_char_wh_ratio_(0.0),
    acceptable_choice_found_(false) {
  ASSERT_HOST(dict_ != NULL);
  dawg_args_ = new DawgArgs(NULL, NULL, new DawgInfoVector(),
                            new DawgInfoVector(),
                            0.0, NO_PERM, kAnyWordLength, -1);
  beginning_active_dawgs_ = new DawgInfoVector();
  beginning_constraints_ = new DawgInfoVector();
  fixed_length_beginning_active_dawgs_ = new DawgInfoVector();
  empty_dawg_info_vec_ = new DawgInfoVector();
}

LanguageModel::~LanguageModel() {
  delete beginning_active_dawgs_;
  delete beginning_constraints_;
  delete fixed_length_beginning_active_dawgs_;
  delete empty_dawg_info_vec_;
  delete dawg_args_->updated_active_dawgs;
  delete dawg_args_->updated_constraints;
  delete dawg_args_;
}

void LanguageModel::InitForWord(
    const WERD_CHOICE *prev_word,
    bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio,
    float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record,
    BlamerBundle *blamer_bundle, bool debug_blamer) {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // For each cell, generate a "pain point" if the cell is not classified
  // and has a left or right neighbor that was classified.
  MATRIX *ratings = chunks_record->ratings;
  for (int col = 0; col < ratings->dimension(); ++col) {
    for (int row = col+1; row < ratings->dimension(); ++row) {
      if ((row > 0 && ratings->get(col, row-1) != NOT_CLASSIFIED) ||
          (col+1 < ratings->dimension() &&
           ratings->get(col+1, row) != NOT_CLASSIFIED)) {
        float worst_piece_cert;
        bool fragmented;
        GetWorstPieceCertainty(col, row, chunks_record->ratings,
                               &worst_piece_cert, &fragmented);
        GeneratePainPoint(col, row, true, kInitialPainPointPriorityAdjustment,
                          worst_piece_cert, fragmented, best_choice_cert,
                          max_char_wh_ratio_, NULL, NULL,
                          chunks_record, pain_points);
      }
    }
  }

  // Initialize vectors with beginning DawgInfos.
  beginning_active_dawgs_->clear();
  dict_->init_active_dawgs(kAnyWordLength, beginning_active_dawgs_, false);
  beginning_constraints_->clear();
  dict_->init_constraints(beginning_constraints_);
  if (dict_->GetMaxFixedLengthDawgIndex() >= 0) {
    fixed_length_beginning_active_dawgs_->clear();
    for (int i = 0; i < beginning_active_dawgs_->size(); ++i) {
      int dawg_index = (*beginning_active_dawgs_)[i].dawg_index;
      if (dawg_index <= dict_->GetMaxFixedLengthDawgIndex() &&
          dawg_index >= kMinFixedLengthDawgLength) {
        *fixed_length_beginning_active_dawgs_ += (*beginning_active_dawgs_)[i];
      }
    }
  }

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ += ' ';
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL &&
      blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
      blamer_bundle->truth_has_char_boxes) {
    STRING blamer_debug;
    blamer_debug += "Blamer computing correct_segmentation_cols\n";
    int curr_box_col = 0;
    int next_box_col = 0;
    TBLOB *blob = chunks_record->chunks;
    inT16 next_box_x = (blob != NULL) ? blob->bounding_box().right() : 0;
    for (int truth_idx = 0;
         blob != NULL && truth_idx < blamer_bundle->norm_truth_word.length();
         blob = blob->next) {
      ++next_box_col;
      inT16 curr_box_x = next_box_x;
      if (blob->next != NULL) next_box_x = blob->next->bounding_box().right();
      inT16 truth_x = blamer_bundle->norm_truth_word.BlobBox(truth_idx).right();
      blamer_debug.add_str_int("Box x coord vs. truth: ", curr_box_x);
      blamer_debug.add_str_int(" ", truth_x);
      blamer_debug += "\n";
      if (curr_box_x > (truth_x + blamer_bundle->norm_box_tolerance)) {
        break;  // failed to find a matching box
      } else if (curr_box_x >=
                 (truth_x - blamer_bundle->norm_box_tolerance) &&  // matched
                 (blob->next == NULL ||  // next box can't be included
                  next_box_x > truth_x + blamer_bundle->norm_box_tolerance)) {
        blamer_bundle->correct_segmentation_cols.push_back(curr_box_col);
        blamer_bundle->correct_segmentation_rows.push_back(next_box_col-1);
        ++truth_idx;
        blamer_debug.add_str_int("col=", curr_box_col);
        blamer_debug.add_str_int(" row=", next_box_col-1);
        blamer_debug += "\n";
        curr_box_col = next_box_col;
      }
    }
    if (blob != NULL ||  // trailing blobs
        blamer_bundle->correct_segmentation_cols.length() !=
        blamer_bundle->norm_truth_word.length()) {
      blamer_debug.add_str_int("Blamer failed to find correct segmentation"
                               " (tolerance=",
                               blamer_bundle->norm_box_tolerance);
      if (blob == NULL) blamer_debug += " blob == NULL";
      blamer_debug += ")\n";
      blamer_debug.add_str_int(
          " path length ", blamer_bundle->correct_segmentation_cols.length());
      blamer_debug.add_str_int(" vs. truth ",
                               blamer_bundle->norm_truth_word.length());
      blamer_debug += "\n";
      blamer_bundle->SetBlame(IRR_UNKNOWN, blamer_debug, NULL, debug_blamer);
      blamer_bundle->correct_segmentation_cols.clear();
      blamer_bundle->correct_segmentation_rows.clear();
    }
  }  // end if (blamer_bundle != NULL)

  // Start a new hypothesis list for this run of segmentation search.
  if (blamer_bundle != NULL) {
    blamer_bundle->params_training_bundle.StartHypothesisList();
  }
}

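// (Note: updated_flags_ holds pointers to the "updated" fields of the
//  ViterbiStateEntries created by AddViterbiStateEntry(); resetting them
//  here lets UpdateState() skip parent entries that have not changed.)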
void LanguageModel::CleanUp() {
  for (int i = 0; i < updated_flags_.size(); ++i) {
    *(updated_flags_[i]) = false;
  }
  updated_flags_.clear();
}

void LanguageModel::DeleteState(BLOB_CHOICE_LIST *choices) {
  BLOB_CHOICE_IT b_it(choices);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    if (b_it.data()->language_model_state() != NULL) {
      LanguageModelState *state = reinterpret_cast<LanguageModelState *>(
          b_it.data()->language_model_state());
      delete state;
      b_it.data()->set_language_model_state(NULL);
    }
  }
}

LanguageModelFlagsType LanguageModel::UpdateState(
    LanguageModelFlagsType changed,
    int curr_col, int curr_row,
    BLOB_CHOICE_LIST *curr_list,
    BLOB_CHOICE_LIST *parent_list,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d (changed=0x%x parent=%p)\n",
            curr_col, curr_row, changed, parent_list);
  }
  // Initialize helper variables.
  bool word_end = (curr_row+1 >= chunks_record->ratings->dimension());
  bool just_classified = (changed & kJustClassifiedFlag);
  LanguageModelFlagsType new_changed = 0x0;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
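  // (Note: denom is the normalizer computed by ComputeDenom() further down
  //  in this file; ComputeNgramCost() divides CertaintyScore(certainty) by
  //  it to turn a classifier certainty into a rough P(char) estimate.)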

  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  int c_it_counter = 0;
  bool first_iteration = true;
  BLOB_CHOICE *first_lower = NULL;
  BLOB_CHOICE *first_upper = NULL;
  GetTopChoiceLowerUpper(changed, curr_list, &first_lower, &first_upper);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    if (dict_->GetMaxFixedLengthDawgIndex() >= 0 &&
        c_it_counter++ >= language_model_fixed_length_choices_depth) {
      break;
    }
    // Skip NULL unichars unless it is the only choice.
    if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    if (dict_->getUnicharset().get_fragment(c_it.data()->unichar_id())) {
      continue;  // skip fragments
    }
    // Set top choice flags.
    LanguageModelFlagsType top_choice_flags = 0x0;
    if (first_iteration && (changed & kSmallestRatingFlag)) {
      top_choice_flags |= kSmallestRatingFlag;
    }
    if (first_lower == c_it.data()) top_choice_flags |= kLowerCaseFlag;
    if (first_upper == c_it.data()) top_choice_flags |= kUpperCaseFlag;

    if (parent_list == NULL) {  // process the beginning of a word
      new_changed |= AddViterbiStateEntry(
          top_choice_flags, denom, word_end, curr_col, curr_row,
          c_it.data(), NULL, NULL, pain_points, best_path_by_column,
          chunks_record, best_choice_bundle, blamer_bundle);
    } else {  // get viterbi entries from each of the parent BLOB_CHOICEs
      BLOB_CHOICE_IT p_it(parent_list);
      for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
        LanguageModelState *parent_lms =
            reinterpret_cast<LanguageModelState *>(
                p_it.data()->language_model_state());
        if (parent_lms == NULL || parent_lms->viterbi_state_entries.empty()) {
          continue;
        }
        vit.set_to_list(&(parent_lms->viterbi_state_entries));
        int vit_counter = 0;
        for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
          // Skip pruned entries and do not look at prunable entries if already
          // examined language_model_viterbi_list_max_num_prunable of those.
          if (PrunablePath(vit.data()->top_choice_flags,
                           vit.data()->dawg_info) &&
              (++vit_counter > language_model_viterbi_list_max_num_prunable ||
               (language_model_ngram_on && vit.data()->ngram_info->pruned))) {
            continue;
          }
          // Only consider the parent if it has been updated or
          // if the current ratings cell has just been classified.
          if (!just_classified && !vit.data()->updated) continue;
          // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
          // looks good according to the Dawgs or character ngram model.
          new_changed |= AddViterbiStateEntry(
              top_choice_flags, denom, word_end, curr_col, curr_row,
              c_it.data(), p_it.data(), vit.data(), pain_points,
              best_path_by_column, chunks_record,
              best_choice_bundle, blamer_bundle);
        }
      }  // done looking at parents for this c_it.data()
    }
    first_iteration = false;
  }
  return new_changed;
}

bool LanguageModel::ProblematicPath(const ViterbiStateEntry &vse,
                                    UNICHAR_ID unichar_id, bool word_end) {
  // The path is problematic if it is inconsistent and has a parent that
  // is consistent (or a NULL parent).
  if (!vse.Consistent() && (vse.parent_vse == NULL ||
                            vse.parent_vse->Consistent())) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: inconsistent\n");
    }
    return true;
  }
  // The path is problematic if it does not represent a dictionary word,
  // while its parent does.
  if (vse.dawg_info == NULL &&
      (vse.parent_vse == NULL || vse.parent_vse->dawg_info != NULL)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: dict word terminated\n");
    }
    return true;
  }
  // The path is problematic if ngram info indicates that this path is
  // an unlikely sequence of characters, while its parent does not have
  // extreme dips in ngram probabilities.
  if (vse.ngram_info != NULL && vse.ngram_info->pruned &&
      (vse.parent_vse == NULL || !vse.parent_vse->ngram_info->pruned)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: small ngram prob\n");
    }
    return true;
  }
  // The path is problematic if there is a non-alpha character in the
  // middle of the path (unless it is a digit in the middle of a path
  // that represents a number).
  if ((vse.parent_vse != NULL) && !word_end &&  // is middle
      !(dict_->getUnicharset().get_isalpha(unichar_id) ||  // alpha
        (dict_->getUnicharset().get_isdigit(unichar_id) &&  // ok digit
         vse.dawg_info != NULL && vse.dawg_info->permuter == NUMBER_PERM))) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: non-alpha middle\n");
    }
    return true;
  }
  return false;
}

void LanguageModel::GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
                                           BLOB_CHOICE_LIST *curr_list,
                                           BLOB_CHOICE **first_lower,
                                           BLOB_CHOICE **first_upper) {
  if (!(changed & kLowerCaseFlag || changed & kUpperCaseFlag)) return;
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = NULL;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == NULL) first_unichar = c_it.data();
    if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == NULL && unicharset.get_isupper(unichar_id)) {
      *first_upper = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != NULL);
  if (*first_lower == NULL) *first_lower = first_unichar;
  if (*first_upper == NULL) *first_upper = first_unichar;
}

LanguageModelFlagsType LanguageModel::AddViterbiStateEntry(
    LanguageModelFlagsType top_choice_flags,
    float denom,
    bool word_end,
    int curr_col, int curr_row,
    BLOB_CHOICE *b,
    BLOB_CHOICE *parent_b,
    ViterbiStateEntry *parent_vse,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 0) {
    tprintf("\nAddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x parent_vse=%p\n",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags, parent_vse);
    if (language_model_debug_level > 3 && b->language_model_state() != NULL) {
      tprintf("Existing viterbi list:\n");
      vit.set_to_list(&(reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }
  // Check whether the list is full.
  if (b->language_model_state() != NULL &&
      (reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries_length >=
       language_model_viterbi_list_max_size)) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return 0x0;
  }

  LanguageModelFlagsType changed = 0x0;
  float optimistic_cost = 0.0f;
  if (!language_model_ngram_on) optimistic_cost += b->rating();
  if (parent_vse != NULL) optimistic_cost += parent_vse->cost;
  // Discard this entry if it will not beat best choice.
  if (optimistic_cost >= best_choice_bundle->best_choice->rating()) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %.4f"
              " best_choice->rating()=%.4f\n", optimistic_cost,
              best_choice_bundle->best_choice->rating());
    }
    return 0x0;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LanguageModelConsistencyInfo consistency_info;
  FillConsistencyInfo(curr_col, word_end, b, parent_vse, parent_b,
                      chunks_record, &consistency_info);

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
      GenerateDawgInfo(word_end, consistency_info.script_id,
                       curr_col, curr_row, *b, parent_vse, &changed);

  // Invoke TopChoice language model component.
  float ratings_sum = b->rating();
  if (parent_vse != NULL) ratings_sum += parent_vse->ratings_sum;
  GenerateTopChoiceInfo(ratings_sum, dawg_info, consistency_info,
                        parent_vse, b, &top_choice_flags, &changed);

  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = NULL;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, parent_vse, parent_b, &changed);
    ASSERT_HOST(ngram_info != NULL);
  }

  // Prune non-top choice paths with inconsistent scripts.
  if (consistency_info.inconsistent_script) {
    if (!(top_choice_flags & kSmallestRatingFlag)) changed = 0x0;
    if (ngram_info != NULL) ngram_info->pruned = true;
  }

  // If the language model components did not like this unichar - return.
  if (!changed) {
    if (language_model_debug_level > 0) {
      tprintf("Language model components did not like this entry\n");
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, chunks_record, &associate_stats);
  if (parent_vse != NULL) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Compute the aggregate cost (adjusted ratings sum).
  float cost = ComputeAdjustedPathCost(
      ratings_sum,
      (parent_vse == NULL) ? 1 : (parent_vse->length+1),
      (dawg_info == NULL) ? 0.0f : 1.0f,
      dawg_info, ngram_info, consistency_info, associate_stats, parent_vse);

  if (b->language_model_state() == NULL) {
    b->set_language_model_state(new LanguageModelState(curr_col, curr_row));
  }
  LanguageModelState *lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(top_choice_flags, dawg_info) &&
      (lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      cost >= lms->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              cost, lms->viterbi_state_entries_prunable_max_cost);
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Create the new ViterbiStateEntry and add it to lms->viterbi_state_entries.
  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
      parent_b, parent_vse, b, cost, ComputeOutlineLength(b), consistency_info,
      associate_stats, top_choice_flags, dawg_info, ngram_info);
  updated_flags_.push_back(&(new_vse->updated));
  lms->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                        false, new_vse);
  ++lms->viterbi_state_entries_length;
  if (PrunablePath(top_choice_flags, dawg_info)) {
    ++lms->viterbi_state_entries_prunable_length;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with a higher adjusted ratings_sum than
  // new_vse->ratings_sum.
  if ((lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) || top_choice_flags) {
    ASSERT_HOST(!lms->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have a higher ratings_sum than new_vse->ratings_sum
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          ComputeConsistencyAdjustedRatingsSum(
              curr_vse->ratings_sum, curr_vse->dawg_info,
              curr_vse->consistency_info) >
          ComputeConsistencyAdjustedRatingsSum(
              new_vse->ratings_sum, new_vse->dawg_info,
              new_vse->consistency_info)) {
        curr_vse->top_choice_flags &= ~(top_choice_flags);
      }
      if (prunable_counter > 0 &&
          PrunablePath(curr_vse->top_choice_flags, curr_vse->dawg_info)) {
        --prunable_counter;
      }
      // Update lms->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        lms->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %.4f\n",
                  lms->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    PrintViterbiStateEntry("New", new_vse, b, chunks_record);
    if (language_model_debug_level > 3) {
      tprintf("Updated viterbi list (max cost %g):\n",
              lms->viterbi_state_entries_prunable_max_cost);
      vit.set_to_list(&(lms->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(b, new_vse, pain_points, chunks_record,
                     best_choice_bundle, blamer_bundle);
  }

  // Update stats in best_path_by_column.
  if (new_vse->Consistent() || new_vse->dawg_info != NULL ||
      (new_vse->ngram_info != NULL && !new_vse->ngram_info->pruned)) {
    float avg_cost = new_vse->cost / static_cast<float>(curr_row+1);
    for (int c = curr_col; c <= curr_row; ++c) {
      if (avg_cost < (*best_path_by_column)[c].avg_cost) {
        (*best_path_by_column)[c].avg_cost = avg_cost;
        (*best_path_by_column)[c].best_vse = new_vse;
        (*best_path_by_column)[c].best_b = b;
        if (language_model_debug_level > 0) {
          tprintf("Set best_path_by_column[%d]=(%g %p)\n",
                  c, avg_cost, new_vse);
        }
      }
    }
  }
  return changed;
}

void LanguageModel::PrintViterbiStateEntry(
    const char *msg, ViterbiStateEntry *vse,
    BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record) {
  tprintf("%s ViterbiStateEntry %p with ratings_sum=%.4f length=%d cost=%.4f",
          msg, vse, vse->ratings_sum, vse->length, vse->cost);
  if (vse->top_choice_flags) {
    tprintf(" top_choice_flags=0x%x", vse->top_choice_flags);
  }
  if (!vse->Consistent()) {
    tprintf(" inconsistent=(punc %d case %d chartype %d script %d)\n",
            vse->consistency_info.NumInconsistentPunc(),
            vse->consistency_info.NumInconsistentCase(),
            vse->consistency_info.NumInconsistentChartype(),
            vse->consistency_info.inconsistent_script);
  }
  if (vse->dawg_info) tprintf(" permuter=%d", vse->dawg_info->permuter);
  if (vse->ngram_info) {
    tprintf(" ngram_cost=%g context=%s ngram pruned=%d",
            vse->ngram_info->ngram_cost,
            vse->ngram_info->context.string(),
            vse->ngram_info->pruned);
  }
  if (vse->associate_stats.shape_cost > 0.0f) {
    tprintf(" shape_cost=%g", vse->associate_stats.shape_cost);
  }
  if (language_model_debug_level > 3) {
    STRING wd_str;
    WERD_CHOICE *wd = ConstructWord(b, vse, chunks_record,
                                    NULL, NULL, NULL, NULL, NULL, NULL);
    wd->string_and_lengths(&wd_str, NULL);
    delete wd;
    tprintf(" str=%s", wd_str.string());
  }
  tprintf("\n");
}

void LanguageModel::GenerateTopChoiceInfo(
    float ratings_sum,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *b,
    LanguageModelFlagsType *top_choice_flags,
    LanguageModelFlagsType *changed) {
  ratings_sum = ComputeConsistencyAdjustedRatingsSum(
      ratings_sum, dawg_info, consistency_info);
  // Clear flags that do not agree with parent_vse->top_choice_flags.
  if (parent_vse != NULL) *top_choice_flags &= parent_vse->top_choice_flags;
  if (consistency_info.Consistent()) *top_choice_flags |= kConsistentFlag;
  if (*top_choice_flags == 0x0) return;
  LanguageModelState *lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());
  if (lms != NULL && !lms->viterbi_state_entries.empty()) {
    ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      if (ratings_sum >= ComputeConsistencyAdjustedRatingsSum(
          vit.data()->ratings_sum, vit.data()->dawg_info,
          vit.data()->consistency_info)) {
        // Clear the appropriate flags if the list already contains
        // a top choice entry with a lower cost.
        *top_choice_flags &= ~(vit.data()->top_choice_flags);
      }
    }
  }
  if (language_model_debug_level > 0) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            *top_choice_flags);
  }
  *changed |= *top_choice_flags;
}

LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    bool word_end, int script_id,
    int curr_col, int curr_row,
    const BLOB_CHOICE &b,
    const ViterbiStateEntry *parent_vse,
    LanguageModelFlagsType *changed) {
  bool use_fixed_length_dawgs = !fixed_length_beginning_active_dawgs_->empty();

  // Initialize active_dawgs and constraints from parent_vse if it is not NULL,
  // otherwise use beginning_active_dawgs_ and beginning_constraints_.
  if (parent_vse == NULL ||
      (use_fixed_length_dawgs && parent_vse->dawg_info == NULL)) {
    dawg_args_->active_dawgs = beginning_active_dawgs_;
    dawg_args_->constraints = beginning_constraints_;
    dawg_args_->permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
    dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
    dawg_args_->constraints = parent_vse->dawg_info->constraints;
    dawg_args_->permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (!use_fixed_length_dawgs && word_end &&
      dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }

  // Deal with compound words.
  if (!use_fixed_length_dawgs && dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length.
    if (parent_vse == NULL || word_end ||
        dawg_args_->permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length) return NULL;

    int i;
    // Check that the path terminated before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
      const DawgInfo &info = (*parent_vse->dawg_info->active_dawgs)[i];
      const Dawg *pdawg = dict_->GetDawg(info.dawg_index);
      assert(pdawg != NULL);
      if (pdawg->type() == DAWG_TYPE_WORD && info.ref != NO_EDGE &&
          pdawg->end_of_word(info.ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return NULL;

    // Return LanguageModelDawgInfo with active_dawgs set to
    // the beginning dawgs of type DAWG_TYPE_WORD.
    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    DawgInfoVector beginning_word_dawgs;
    for (i = 0; i < beginning_active_dawgs_->size(); ++i) {
      const Dawg *bdawg =
          dict_->GetDawg((*beginning_active_dawgs_)[i].dawg_index);
      if (bdawg->type() == DAWG_TYPE_WORD) {
        beginning_word_dawgs += (*beginning_active_dawgs_)[i];
      }
    }
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(&(beginning_word_dawgs),
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = NULL;

  // Call LetterIsOkay().
  dict_->LetterIsOkay(dawg_args_, b.unichar_id(), word_end);
  if (dawg_args_->permuter != NO_PERM) {
    *changed |= kDawgFlag;
    dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_active_dawgs,
                                          dawg_args_->updated_constraints,
                                          dawg_args_->permuter);
  }

  // For non-space delimited languages: since every letter could be
  // a valid word, a new word could start at every unichar. Thus append
  // fixed_length_beginning_active_dawgs_ to dawg_info->active_dawgs.
  if (use_fixed_length_dawgs) {
    if (dawg_info == NULL) {
      *changed |= kDawgFlag;
      dawg_info = new LanguageModelDawgInfo(
          fixed_length_beginning_active_dawgs_,
          empty_dawg_info_vec_, SYSTEM_DAWG_PERM);
    } else {
      *(dawg_info->active_dawgs) += *fixed_length_beginning_active_dawgs_;
    }
  }  // done dealing with fixed-length dawgs

  return dawg_info;
}

LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    LanguageModelFlagsType *changed) {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == NULL) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
        parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_prob;
  float ngram_cost = ComputeNgramCost(unichar, certainty, denom,
                                      pcontext_ptr, &unichar_step_len,
                                      &pruned, &ngram_prob);
  // First attempt to normalize ngram_cost for strings of different
  // lengths - we multiply ngram_cost by P(char | context) as many times
  // as the number of chunks occupied by char. This makes the ngram costs
  // of all the paths ending at the current BLOB_CHOICE comparable.
  // TODO(daria): it would be worth looking at different ways of normalization.
  if (curr_row > curr_col) {
    ngram_cost += (curr_row - curr_col) * ngram_cost;
    ngram_prob += (curr_row - curr_col) * ngram_prob;
  }
  // Add the ngram_cost of the parent.
  if (parent_vse != NULL) {
    ngram_cost += parent_vse->ngram_info->ngram_cost;
    ngram_prob += parent_vse->ngram_info->ngram_prob;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
  if (!pruned) *changed |= kNgramFlag;

  // Construct and return the new LanguageModelNgramInfo.
  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_prob, ngram_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

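// Computes the cost of classifying unichar in the given context:
//   -log(CertaintyScore(certainty) / denom)
//     + (-log(P(unichar | context))) * language_model_ngram_scale_factor,
// and outputs the number of UTF8 steps in unichar, whether an unusually
// small probability was encountered, and the raw -log(P(unichar | context)).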
float LanguageModel::ComputeNgramCost(const char *unichar,
                                      float certainty,
                                      float denom,
                                      const char *context,
                                      int *unichar_step_len,
                                      bool *found_small_prob,
                                      float *ngram_prob) {
  const char *context_ptr = context;
  char *modified_context = NULL;
  char *modified_context_end = NULL;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from str,
    // unless use_only_first_uft8_step is true.
    if (unichar_ptr < unichar_end) {
      if (modified_context == NULL) {
        int context_len = strlen(context);
        modified_context =
            new char[context_len + strlen(unichar_ptr) + step + 1];
        strncpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
  }
  *ngram_prob = -1.0*log(prob);
  float cost = -1.0*log(CertaintyScore(certainty)/denom) +
      *ngram_prob * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty)/denom, prob, cost);
  }
  if (modified_context != NULL) delete[] modified_context;
  return cost;
}

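// Computes the normalizing denominator used by ComputeNgramCost(): the sum
// of CertaintyScore() over the choices in curr_list, plus a crude estimate,
// (unicharset size - list length) * CertaintyScore(nonmatch score), for the
// classifications that are missing from the list.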
float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != NULL);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
      CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}

void LanguageModel::FillConsistencyInfo(
    int curr_col,
    bool word_end,
    BLOB_CHOICE *b,
    ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    CHUNKS_RECORD *chunks_record,
    LanguageModelConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  if (parent_vse != NULL) *consistency_info = parent_vse->consistency_info;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      UNICHAR_ID pattern_unichar_id =
          (unicharset.get_isalpha(unichar_id) ||
           unicharset.get_isdigit(unichar_id)) ?
          Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
            (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
                node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  } else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != NULL &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (parent_b != NULL) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id2()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    bool expected_gap_found = false;
    float expected_gap;
    int temp_gap;
    if (fontinfo_id >= 0) {  // found a common font
      if (fontinfo_table_->get(fontinfo_id).get_spacing(
          parent_b->unichar_id(), unichar_id, &temp_gap)) {
        expected_gap = temp_gap;
        expected_gap_found = true;
      }
    } else {
      consistency_info->inconsistent_font = true;
      // Get an average of the expected gaps in each font.
      int num_addends = 0;
      expected_gap = 0;
      int temp_fid;
      for (int i = 0; i < 4; ++i) {
        if (i == 0) {
          temp_fid = parent_b->fontinfo_id();
        } else if (i == 1) {
          temp_fid = parent_b->fontinfo_id2();
        } else if (i == 2) {
          temp_fid = b->fontinfo_id();
        } else {
          temp_fid = b->fontinfo_id2();
        }
        if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap += temp_gap;
          num_addends++;
        }
      }
      expected_gap_found = (num_addends > 0);
      if (num_addends > 0) expected_gap /= static_cast<float>(num_addends);
    }
    if (expected_gap_found) {
      float actual_gap =
          static_cast<float>(AssociateUtils::GetChunksGap(
              chunks_record->chunk_widths, curr_col-1));
      float gap_ratio = expected_gap / actual_gap;
      // TODO(daria): find a good way to tune this heuristic estimate.
      if (gap_ratio < 0.5f || gap_ratio > 2.0f) {
        consistency_info->num_inconsistent_spaces++;
      }
      if (language_model_debug_level > 1) {
        tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                unicharset.id_to_unichar(parent_b->unichar_id()),
                parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                unichar_id, curr_col, expected_gap, actual_gap);
      }
    }
  }
}

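// Returns ngram_cost * adjustment when the ngram model is on, and
// ratings_sum * adjustment (plus consistency adjustments) otherwise;
// adjustment starts at 1.0 and accumulates the applicable
// language_model_penalty_* terms and the length-normalized shape cost.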
float LanguageModel::ComputeAdjustedPathCost(
    float ratings_sum, int length, float dawg_score,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelNgramInfo *ngram_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const AssociateStats &associate_stats,
    ViterbiStateEntry *parent_vse) {
  float adjustment = 1.0f;
  if (dawg_info == NULL || dawg_info->permuter != FREQ_DAWG_PERM) {
    adjustment += language_model_penalty_non_freq_dict_word;
  }
  if (dawg_score == 0.0f) {
    adjustment += language_model_penalty_non_dict_word;
    if (length > language_model_min_compound_length) {
      adjustment += ((length - language_model_min_compound_length) *
                     language_model_penalty_increment);
    }
  } else if (dawg_score < 1.0f) {
    adjustment += (1.0f - dawg_score) * language_model_penalty_non_dict_word;
  }
  if (associate_stats.shape_cost > 0) {
    adjustment += associate_stats.shape_cost / static_cast<float>(length);
  }
  if (language_model_ngram_on) {
    ASSERT_HOST(ngram_info != NULL);
    return ngram_info->ngram_cost * adjustment;
  } else {
    adjustment += ComputeConsistencyAdjustment(dawg_info, consistency_info);
    return ratings_sum * adjustment;
  }
}

void LanguageModel::UpdateBestChoice(
    BLOB_CHOICE *b,
    ViterbiStateEntry *vse,
    HEAP *pain_points,
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  int i;
  BLOB_CHOICE_LIST_VECTOR temp_best_char_choices(vse->length);
  for (i = 0; i < vse->length; ++i) {
    temp_best_char_choices.push_back(NULL);
  }
  float *certainties = new float[vse->length];
  STATE temp_state;
  // The fraction of letters in the path that are "covered" by dawgs.
  // For space delimited languages this number will be 0.0 for nonwords
  // and 1.0 for dictionary words. For non-space delimited languages
  // this number will be in the [0.0, 1.0] range.
  float dawg_score;
  bool truth_path;
  WERD_CHOICE *word = ConstructWord(b, vse, chunks_record,
                                    &temp_best_char_choices, certainties,
                                    &dawg_score, &temp_state,
                                    blamer_bundle, &truth_path);
  ASSERT_HOST(word != NULL);
  bool not_blaming =
      (blamer_bundle == NULL || !blamer_bundle->segsearch_is_looking_for_blame);

  // Log new segmentation (for dict_->LogNewChoice()).
  if (not_blaming) {
    PIECES_STATE pieces_widths;
    bin_to_pieces(&temp_state, chunks_record->ratings->dimension() - 1,
                  pieces_widths);
    dict_->LogNewSegmentation(pieces_widths);
  }

  if (language_model_debug_level > 0) {
    STRING word_str;
    word->string_and_lengths(&word_str, NULL);
    tprintf("UpdateBestChoice() constructed word %s\n", word_str.string());
    if (language_model_debug_level > 2) word->print();
  }

  // Update raw_choice if needed.
  if ((vse->top_choice_flags & kSmallestRatingFlag) &&
      word->rating() < best_choice_bundle->raw_choice->rating() &&
      not_blaming) {
    dict_->LogNewChoice(1.0, certainties, true, word, temp_best_char_choices);
    *(best_choice_bundle->raw_choice) = *word;
    best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
    if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
  }

  // When working with non-space delimited languages we re-adjust the cost
  // taking into account the final dawg_score and a more precise shape cost.
  // While constructing the paths we assume that they are all dictionary words
  // (since a single character would be a valid dictionary word). At the end
  // we compute dawg_score which reflects how many characters on the path are
  // "covered" by dictionary words of length > 1.
  if (vse->associate_stats.full_wh_ratio_var != 0.0f ||
      (dict_->GetMaxFixedLengthDawgIndex() >= 0 && dawg_score < 1.0f)) {
    vse->cost = ComputeAdjustedPathCost(
        vse->ratings_sum, vse->length, dawg_score, vse->dawg_info,
        vse->ngram_info, vse->consistency_info, vse->associate_stats,
        vse->parent_vse);
    if (language_model_debug_level > 0) {
      tprintf("Updated vse cost to %g (dawg_score %g full_wh_ratio_var %g)\n",
              vse->cost, dawg_score, vse->associate_stats.full_wh_ratio_var);
    }
  }

  // Update best choice and best char choices if needed.
  // TODO(daria): re-write AcceptableChoice() and NoDangerousAmbig()
  // to fit better into the new segmentation search.
  word->set_rating(vse->cost);
  if (word->rating() < best_choice_bundle->best_choice->rating() &&
      not_blaming) {
    dict_->LogNewChoice(vse->cost / ((language_model_ngram_on) ?
                                     vse->ngram_info->ngram_cost :
                                     vse->ratings_sum),
                        certainties, false, word, temp_best_char_choices);
    // Since the rating of the word could have been modified by
    // Dict::LogNewChoice() - check again.
    if (word->rating() < best_choice_bundle->best_choice->rating()) {
      bool modified_blobs;  // not used
      DANGERR fixpt;
      if (dict_->AcceptableChoice(&temp_best_char_choices, word, &fixpt,
                                  ASSOCIATOR_CALLER, &modified_blobs) &&
          AcceptablePath(*vse)) {
        acceptable_choice_found_ = true;
      }
      // Update best_choice_bundle.
      *(best_choice_bundle->best_choice) = *word;
      best_choice_bundle->updated = true;
      best_choice_bundle->best_char_choices->delete_data_pointers();
      best_choice_bundle->best_char_choices->clear();
      for (i = 0; i < temp_best_char_choices.size(); ++i) {
        BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
        cc_list->deep_copy(temp_best_char_choices[i], &BLOB_CHOICE::deep_copy);
        best_choice_bundle->best_char_choices->push_back(cc_list);
      }
      best_choice_bundle->best_state->part2 = temp_state.part2;
      best_choice_bundle->best_state->part1 = temp_state.part1;
      if (language_model_debug_level > 0) {
        tprintf("Updated best choice\n");
        print_state("New state ", best_choice_bundle->best_state,
                    chunks_record->ratings->dimension()-1);
      }
      // Update hyphen state if we are dealing with a dictionary word.
      if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
        if (dict_->has_hyphen_end(*word)) {
          dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs),
                                 *(dawg_args_->constraints));
        } else {
          dict_->reset_hyphen_vars(true);
        }
      }
      best_choice_bundle->best_vse = vse;
      best_choice_bundle->best_b = b;
      best_choice_bundle->fixpt = fixpt;

      if (blamer_bundle != NULL) {
        blamer_bundle->best_choice_is_dict_and_top_choice =
            (vse->dawg_info != NULL &&
             dict_->GetMaxFixedLengthDawgIndex() < 0 &&
             (vse->top_choice_flags));
      }
    }
  }
  if (blamer_bundle != NULL) {
    // Record the current hypothesis in params_training_bundle.
    ParamsTrainingHypothesis &hyp =
        blamer_bundle->params_training_bundle.AddHypothesis();
    word->string_and_lengths(&(hyp.str), NULL);
    ExtractRawFeatures(*vse, hyp.features);
    if (truth_path &&  // update best truth path rating
        word->rating() < blamer_bundle->best_correctly_segmented_rating) {
      blamer_bundle->best_correctly_segmented_rating = word->rating();
    }
  }

  // Clean up.
  delete[] certainties;
  delete word;
}

void LanguageModel::ExtractRawFeatures(const ViterbiStateEntry &vse,
                                       float *features) {
  for (int f = 0; f < PTRAIN_NUM_RAW_FEATURE_TYPES; ++f) features[f] = 0.0;
  // Dictionary-related features.
  if (vse.dawg_info != NULL) {
    features[PTRAIN_RAW_FEATURE_DICT_MATCH] = 1.0;

    // Mark as unambiguous if unambig_dawg is found among active dawgs.
    for (int d = 0; d < vse.dawg_info->active_dawgs->size(); ++d) {
      if (dict_->GetDawg(vse.dawg_info->active_dawgs->get(d).dawg_index) ==
          dict_->GetUnambigDawg()) {
        features[PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH] = 1.0;
        break;
      }
    }
  }
  if (vse.associate_stats.shape_cost > 0) {
    features[PTRAIN_RAW_FEATURE_SHAPE_COST] = vse.associate_stats.shape_cost;
  }
  if (language_model_ngram_on) {
    ASSERT_HOST(vse.ngram_info != NULL);
    features[PTRAIN_RAW_FEATURE_NGRAM_PROB] = vse.ngram_info->ngram_prob;
  }
  // Consistency-related features.
  features[PTRAIN_RAW_FEATURE_NUM_BAD_PUNC] =
      vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_CASE] =
      vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE] =
      vse.consistency_info.NumInconsistentChartype();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT] =
      vse.consistency_info.inconsistent_script;
  features[PTRAIN_RAW_FEATURE_NUM_BAD_FONT] =
      vse.consistency_info.inconsistent_font;
  // Classifier-related features.
  features[PTRAIN_RAW_FEATURE_WORST_CERT] = vse.min_certainty;
  features[PTRAIN_RAW_FEATURE_RATING] = vse.ratings_sum;
  features[PTRAIN_RAW_FEATURE_ADAPTED] = vse.adapted;
  // Normalization-related features.
  features[PTRAIN_RAW_FEATURE_NUM_UNICHARS] = vse.length;
  features[PTRAIN_RAW_FEATURE_OUTLINE_LEN] = vse.outline_length;
}

WERD_CHOICE *LanguageModel::ConstructWord(
    BLOB_CHOICE *b,
    ViterbiStateEntry *vse,
    CHUNKS_RECORD *chunks_record,
    BLOB_CHOICE_LIST_VECTOR *best_char_choices,
    float certainties[],
    float *dawg_score,
    STATE *state,
    BlamerBundle *blamer_bundle,
    bool *truth_path) {
  uinT64 state_uint = 0x0;
  if (truth_path != NULL) {
    *truth_path =
        (blamer_bundle != NULL &&
         !blamer_bundle->correct_segmentation_cols.empty() &&
         vse->length == blamer_bundle->correct_segmentation_cols.length());
  }
  const uinT64 kHighestBitOn = 0x8000000000000000ULL;
  BLOB_CHOICE *curr_b = b;
  LanguageModelState *curr_lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound
  bool dawg_score_done = true;
  if (dawg_score != NULL) {
    *dawg_score = 0.0f;
    // For space-delimited languages the presence of dawg_info in the last
    // ViterbiStateEntry on the path means that the whole path represents
    // a valid dictionary word.
    if (dict_->GetMaxFixedLengthDawgIndex() < 0) {
      if (vse->dawg_info != NULL) *dawg_score = 1.0f;
    } else if (vse->length == 1) {
      *dawg_score = 1.0f;       // each one-letter word is legal
      dawg_score_done = true;   // in non-space delimited languages
    } else {
      dawg_score_done = false;  // do more work to compute dawg_score
    }
  }
  // For non-space delimited languages we compute the fraction of letters
  // "covered" by fixed length dawgs (i.e. words of length > 1 on the path).
  int covered_by_fixed_length_dawgs = 0;
  // The number of unichars remaining that should be skipped because
  // they are covered by the previous word from fixed length dawgs.
  int fixed_length_num_unichars_to_skip = 0;

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    ASSERT_HOST(vse->length > 1);
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }
1431 
1432  // Construct a WERD_CHOICE by tracing parent pointers.
1433  WERD_CHOICE *word = new WERD_CHOICE(chunks_record->word_res->uch_set,
1434  vse->length);
1435  word->set_length(vse->length);
1436  for (i = (vse->length-1); i >= 0; --i) {
1437  if (blamer_bundle != NULL && truth_path != NULL && *truth_path) {
1438  if ((blamer_bundle->correct_segmentation_cols[i] !=
1439  curr_lms->contained_in_col) ||
1440  (blamer_bundle->correct_segmentation_rows[i] !=
1441  curr_lms->contained_in_row)) {
1442  *truth_path = false;
1443  }
1444  }
1445  word->set_unichar_id(curr_b->unichar_id(), i);
1446  word->set_fragment_length(1, i);
1447  if (certainties != NULL) certainties[i] = curr_b->certainty();
1448  if (best_char_choices != NULL) {
1449  best_char_choices->set(chunks_record->ratings->get(
1450  curr_lms->contained_in_col, curr_lms->contained_in_row), i);
1451  }
1452  if (state != NULL) {
1453  // Record row minus col zeroes in the reverse state to mark the number
1454  // of joins done by using a blob from this cell in the ratings matrix.
1455  state_uint >>= (curr_lms->contained_in_row - curr_lms->contained_in_col);
1456  // Record a one in the reverse state to indicate the split before
1457  // the blob from the next cell in the ratings matrix (unless we are
1458  // at the first cell, in which case there is no next blob).
1459  if (i != 0) {
1460  state_uint >>= 1;
1461  state_uint |= kHighestBitOn;
1462  }
1463  }
1464  // For non-space delimited languages: find word endings recorded while
1465  // trying to separate the current path into words (for words found in
1466  // fixed length dawgs.
1467  if (!dawg_score_done && curr_vse->dawg_info != NULL) {
1469  i, vse->length,
1470  &fixed_length_num_unichars_to_skip,
1471  &covered_by_fixed_length_dawgs,
1472  dawg_score, &dawg_score_done);
1473  }
1474  // Update the width-to-height ratio variance. Useful non-space delimited
1475  // languages to ensure that the blobs are of uniform width.
1476  // Skip leading and trailing punctuation when computing the variance.
1477  if ((full_wh_ratio_mean != 0.0f &&
1478  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1479  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1481  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1482  if (language_model_debug_level > 2) {
1483  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1484  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1485  }
1486  }
1487 
1488  // Mark the word as compound if compound permuter was set for any of
1489  // the unichars on the path (usually this will happen for unichars
1490  // that are compounding operators, like "-" and "/").
1491  if (!compound && curr_vse->dawg_info &&
1492  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1493 
1494  // Update curr_* pointers.
1495  if (curr_vse->parent_b == NULL) break;
1496  curr_b = curr_vse->parent_b;
1497  curr_lms =
1498  reinterpret_cast<LanguageModelState *>(curr_b->language_model_state());
1499  curr_vse = curr_vse->parent_vse;
1500  }
1501  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids
1502  // Re-adjust shape cost to include the updated width-to-height variance.
1503  if (full_wh_ratio_mean != 0.0f) {
1504  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1505  }
1506 
1507  if (state != NULL) {
1508  state_uint >>= (64 - (chunks_record->ratings->dimension()-1));
1509  state->part2 = state_uint;
1510  state_uint >>= 32;
1511  state->part1 = state_uint;
1512  }
1513  word->set_rating(vse->ratings_sum);
1514  word->set_certainty(vse->min_certainty);
1515  if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
1516  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1517  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1518  word->set_permuter(NGRAM_PERM);
1519  } else if (vse->top_choice_flags) {
1520  word->set_permuter(TOP_CHOICE_PERM);
1521  } else {
1522  word->set_permuter(NO_PERM);
1523  }
1524  return word;
1525 }
1526 
1527 void LanguageModel::UpdateCoveredByFixedLengthDawgs(
1528  const DawgInfoVector &active_dawgs, int word_index, int word_length,
1529  int *skip, int *covered, float *dawg_score, bool *dawg_score_done) {
1530  if (language_model_debug_level > 3) {
1531  tprintf("UpdateCoveredByFixedLengthDawgs for index %d skip=%d length=%d\n",
1532  word_index, *skip, word_length);
1533  }
1534 
1535  if (*skip > 0) {
1536  --(*skip);
1537  } else {
1538  int best_index = -1;
1539  for (int d = 0; d < active_dawgs.size(); ++d) {
1540  int dawg_index = (active_dawgs[d]).dawg_index;
1541  if (dawg_index > dict_->GetMaxFixedLengthDawgIndex()) {
1542  // If active_dawgs of the last ViterbiStateEntry on the path
1543  // contain a non-fixed length dawg, this means that the whole
1544  // path represents a word from a non-fixed length word dawg.
1545  if (word_index == (word_length - 1)) {
1546  *dawg_score = 1.0f;
1547  *dawg_score_done = true;
1548  return;
1549  }
1550  } else if (dawg_index >= kMinFixedLengthDawgLength) {
1551  const Dawg *curr_dawg = dict_->GetDawg(dawg_index);
1552  ASSERT_HOST(curr_dawg != NULL);
1553  if ((active_dawgs[d]).ref != NO_EDGE &&
1554  curr_dawg->end_of_word((active_dawgs[d]).ref) &&
1555  dawg_index > best_index) {
1556  best_index = dawg_index;
1557  }
1558 
1559  if (language_model_debug_level > 3) {
1560  tprintf("dawg_index %d, ref %d, eow %d\n", dawg_index,
1561  (active_dawgs[d]).ref,
1562  ((active_dawgs[d]).ref != NO_EDGE &&
1563  curr_dawg->end_of_word((active_dawgs[d]).ref)));
1564  }
1565  }
1566  } // end for
1567  if (best_index != -1) {
1568  *skip = best_index - 1;
1569  *covered += best_index;
1570  }
1571  } // end if/else skip
1572 
1573  if (word_index == 0) {
1574  ASSERT_HOST(*covered <= word_length);
1575  *dawg_score = (static_cast<float>(*covered) /
1576  static_cast<float>(word_length));
1577  *dawg_score_done = true;
1578  }
1579 }
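// Worked example for the bookkeeping above (hypothetical values): for a
// 6-unichar path, suppose that at word_index 5 an end-of-word edge is found
// in the fixed length dawg of length 3 (best_index == 3). Then *skip is set
// to 2, so the next two unichars are attributed to that word, and *covered
// grows by 3. If nothing else matches by word_index 0, the final dawg_score
// is 3/6 = 0.5.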
1580 
1581 void LanguageModel::GeneratePainPointsFromColumn(
1582  int col,
1583  const GenericVector<int> &non_empty_rows,
1584  float best_choice_cert,
1585  HEAP *pain_points,
1586  BestPathByColumn *best_path_by_column[],
1587  CHUNKS_RECORD *chunks_record) {
1588  for (int i = 0; i < non_empty_rows.length(); ++i) {
1589  int row = non_empty_rows[i];
1590  if (language_model_debug_level > 0) {
1591  tprintf("\nLooking for pain points in col=%d row=%d\n", col, row);
1592  }
1593  if (language_model_ngram_on) {
1594  GenerateNgramModelPainPointsFromColumn(
1595  col, row, pain_points, chunks_record);
1596  } else {
1597  GenerateProblematicPathPainPointsFromColumn(
1598  col, row, best_choice_cert, pain_points,
1599  best_path_by_column, chunks_record);
1600  }
1601  }
1602 }
1603 
1604 void LanguageModel::GenerateNgramModelPainPointsFromColumn(
1605  int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record) {
1606  // Find the first top choice path recorded for this cell.
1607  // If this path is pruned - generate a pain point.
1608  ASSERT_HOST(chunks_record->ratings->get(col, row) != NULL);
1609  BLOB_CHOICE_IT bit(chunks_record->ratings->get(col, row));
1610  bool fragmented = false;
1611  for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
1612  if (dict_->getUnicharset().get_fragment(bit.data()->unichar_id())) {
1613  fragmented = true;
1614  continue;
1615  }
1616  LanguageModelState *lms = reinterpret_cast<LanguageModelState *>(
1617  bit.data()->language_model_state());
1618  if (lms == NULL) continue;
1619  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
1620  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
1621  const ViterbiStateEntry *vse = vit.data();
1622  if (!vse->top_choice_flags) continue;
1623  ASSERT_HOST(vse->ngram_info != NULL);
1624  if (vse->ngram_info->pruned && (vse->parent_vse == NULL ||
1625  !vse->parent_vse->ngram_info->pruned)) {
1626  if (vse->parent_vse != NULL) {
1627  LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
1628  vse->parent_b->language_model_state());
1629  GeneratePainPoint(pp_lms->contained_in_col, row, false,
1630  kDefaultPainPointPriorityAdjustment,
1631  -1.0f, fragmented, -1.0f,
1632  max_char_wh_ratio_,
1633  vse->parent_vse->parent_b,
1634  vse->parent_vse->parent_vse,
1635  chunks_record, pain_points);
1636  }
1637  if (vse->parent_vse != NULL &&
1638  vse->parent_vse->parent_vse != NULL &&
1639  dict_->getUnicharset().get_ispunctuation(
1640  vse->parent_b->unichar_id())) {
1641  // If the dip in the ngram probability is due to punctuation in the
1642  // middle of the word - go two unichars back to combine this
1643  // punctuation mark with the previous character on the path.
1644  LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
1645  vse->parent_vse->parent_b->language_model_state());
1646  GeneratePainPoint(pp_lms->contained_in_col, col-1, false,
1647  kDefaultPainPointPriorityAdjustment,
1648  -1.0f, fragmented, -1.0f,
1649  max_char_wh_ratio_,
1650  vse->parent_vse->parent_vse->parent_b,
1651  vse->parent_vse->parent_vse->parent_vse,
1652  chunks_record, pain_points);
1653  } else if (row+1 < chunks_record->ratings->dimension()) {
1654  GeneratePainPoint(col, row+1, true,
1655  kDefaultPainPointPriorityAdjustment,
1656  -1.0f, fragmented, -1.0f,
1657  max_char_wh_ratio_,
1658  vse->parent_b,
1659  vse->parent_vse,
1660  chunks_record, pain_points);
1661  }
1662  }
1663  return; // examined the lowest cost top choice path
1664  }
1665  }
1666 }
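// Illustrative trace of the function above (hypothetical): if the top choice
// path first becomes pruned at the blob in (col, row), a pain point joining
// it with its parent blob is tried at (parent's column, row); if the parent
// is a punctuation mark in the middle of the word, another pain point joins
// the punctuation with the character before it, ending at col-1; otherwise
// the current blob is extended downward via (col, row+1).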
1667 
1668 void LanguageModel::GenerateProblematicPathPainPointsFromColumn(
1669  int col, int row, float best_choice_cert,
1670  HEAP *pain_points, BestPathByColumn *best_path_by_column[],
1671  CHUNKS_RECORD *chunks_record) {
1672  MATRIX *ratings = chunks_record->ratings;
1673 
1674  // Get the best path from this matrix cell.
1675  BLOB_CHOICE_LIST *blist = ratings->get(col, row);
1676  ASSERT_HOST(blist != NULL);
1677  if (blist->empty()) return;
1678  BLOB_CHOICE_IT bit(blist);
1679  bool fragment = false;
1680  while (dict_->getUnicharset().get_fragment(bit.data()->unichar_id()) &&
1681  !bit.at_last()) { // skip fragments
1682  fragment = true;
1683  bit.forward();
1684  }
1685  if (bit.data()->language_model_state() == NULL) return;
1686  ViterbiStateEntry_IT vit(&(reinterpret_cast<LanguageModelState *>(
1687  bit.data()->language_model_state())->viterbi_state_entries));
1688  if (vit.empty()) return;
1689  ViterbiStateEntry *vse = vit.data();
1690  // Check whether this path is promising.
1691  bool path_is_promising = true;
1692  if (vse->parent_vse != NULL) {
1693  float potential_avg_cost =
1694  ((vse->parent_vse->cost + bit.data()->rating()*0.5f) /
1695  static_cast<float>(row+1));
1696  if (language_model_debug_level > 0) {
1697  tprintf("potential_avg_cost %g best cost %g\n",
1698  potential_avg_cost, (*best_path_by_column)[col].avg_cost);
1699  }
1700  if (potential_avg_cost >= (*best_path_by_column)[col].avg_cost) {
1701  path_is_promising = false;
1702  }
1703  }
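// E.g. (hypothetical numbers): with a parent cost of 6.0, a blob rating of
// 2.0 and row == 3, potential_avg_cost = (6.0 + 2.0*0.5) / 4 = 1.75; the
// path counts as promising only if 1.75 is below the best average cost
// recorded for this column.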
1704  // Set best_parent_vse to the best parent recorded in best_path_by_column.
1705  ViterbiStateEntry *best_parent_vse = vse->parent_vse;
1706  BLOB_CHOICE *best_parent_b = vse->parent_b;
1707  if (col > 0 && (*best_path_by_column)[col-1].best_vse != NULL) {
1708  ASSERT_HOST((*best_path_by_column)[col-1].best_b != NULL);
1709  LanguageModelState *best_lms = reinterpret_cast<LanguageModelState *>(
1710  ((*best_path_by_column)[col-1].best_b)->language_model_state());
1711  if (best_lms->contained_in_row == col-1) {
1712  best_parent_vse = (*best_path_by_column)[col-1].best_vse;
1713  best_parent_b = (*best_path_by_column)[col-1].best_b;
1714  if (language_model_debug_level > 0) {
1715  tprintf("Setting best_parent_vse to %p\n", best_parent_vse);
1716  }
1717  }
1718  }
1719  // Check whether this entry terminates the best parent path
1720  // recorded in best_path_by_column.
1721  bool best_not_prolonged = (best_parent_vse != vse->parent_vse);
1722 
1723  // If this path is problematic because of the last unichar - generate
1724  // a pain point to combine it with its left and right neighbor.
1725  BLOB_CHOICE_IT tmp_bit;
1726  if (best_not_prolonged ||
1727  (path_is_promising &&
1728  ProblematicPath(*vse, bit.data()->unichar_id(),
1729  row+1 == ratings->dimension()))) {
1730  float worst_piece_cert;
1731  bool fragmented;
1732  if (col-1 > 0) {
1733  GetWorstPieceCertainty(col-1, row, chunks_record->ratings,
1734  &worst_piece_cert, &fragmented);
1735  GeneratePainPoint(col-1, row, false,
1736  kDefaultPainPointPriorityAdjustment,
1737  worst_piece_cert, fragmented, best_choice_cert,
1738  max_char_wh_ratio_, best_parent_b, best_parent_vse,
1739  chunks_record, pain_points);
1740  }
1741  if (row+1 < ratings->dimension()) {
1742  GetWorstPieceCertainty(col, row+1, chunks_record->ratings,
1743  &worst_piece_cert, &fragmented);
1744  GeneratePainPoint(col, row+1, true, kDefaultPainPointPriorityAdjustment,
1745  worst_piece_cert, fragmented, best_choice_cert,
1746  max_char_wh_ratio_, best_parent_b, best_parent_vse,
1747  chunks_record, pain_points);
1748  }
1749  } // end if ProblematicPath
1750 }
1751 
1752 void LanguageModel::GeneratePainPointsFromBestChoice(
1753  HEAP *pain_points,
1754  CHUNKS_RECORD *chunks_record,
1755  BestChoiceBundle *best_choice_bundle) {
1756  // Variables used to backtrack the best_vse path.
1757  ViterbiStateEntry *curr_vse = best_choice_bundle->best_vse;
1758  BLOB_CHOICE *curr_b = best_choice_bundle->best_b;
1759 
1760  // Begins and ends in DANGERR vector record the positions in the blob choice
1761  // list of the best choice. We need to translate these endpoints into the
1762  // beginning column and ending row for the pain points. We maintain
1763  // danger_begin and danger_end arrays indexed by the position in
1764  // best_choice_bundle->best_char_choices (which is equal to the position
1765  // on the best_choice_bundle->best_vse path).
1766  // danger_end[d] stores the DANGERR_INFO structs with end==d and is
1767  // initialized at the beginning of this function.
1768  // danger_begin[d] stores the DANGERR_INFO struct with begin==d and
1769  // has end set to the row of the end of this ambiguity.
1770  // The translation from end in terms of the best choice index to the end row
1771  // is done while following the parents of best_choice_bundle->best_vse.
1772  assert(best_choice_bundle->best_char_choices->length() ==
1773  best_choice_bundle->best_vse->length);
1774  DANGERR *danger_begin = NULL;
1775  DANGERR *danger_end = NULL;
1776  int d;
1777  if (!best_choice_bundle->fixpt.empty()) {
1778  danger_begin = new DANGERR[best_choice_bundle->best_vse->length];
1779  danger_end = new DANGERR[best_choice_bundle->best_vse->length];
1780  for (d = 0; d < best_choice_bundle->fixpt.size(); ++d) {
1781  const DANGERR_INFO &danger = best_choice_bundle->fixpt[d];
1782  // Only use n->1 ambiguities.
1783  if (danger.end > danger.begin && !danger.correct_is_ngram &&
1784  (!language_model_ngram_on || danger.dangerous)) {
1785  danger_end[danger.end].push_back(danger);
1786  }
1787  }
1788  }
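// Worked example (hypothetical): an "rn" -> "m" ambiguity spanning
// best-choice positions begin == 2, end == 4 is stored in danger_end[4].
// When the backtracking loop below reaches word_index 4, its end is
// rewritten as that blob's end row and the entry moves to danger_begin[2];
// when word_index 2 is reached, a pain point is generated from the column
// of blob 2 to the recorded end row.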
1789 
1790  // Variables to keep track of punctuation/number streaks.
1791  int punc_streak_end_row = -1;
1792  int punc_streak_length = 0;
1793  float punc_streak_min_cert = 0.0f;
1794 
1795  if (language_model_debug_level > 0) {
1796  tprintf("\nGenerating pain points for best path=%p\n", curr_vse);
1797  }
1798 
1799  int word_index = best_choice_bundle->best_vse->length;
1800  while (curr_vse != NULL) {
1801  word_index--;
1802  ASSERT_HOST(word_index >= 0);
1803  ASSERT_HOST(curr_b != NULL);
1804  if (language_model_debug_level > 0) {
1805  tprintf("Looking at unichar %s\n",
1807  }
1808 
1809  int pp_col = reinterpret_cast<LanguageModelState *>(
1810  curr_b->language_model_state())->contained_in_col;
1811  int pp_row = reinterpret_cast<LanguageModelState *>(
1812  curr_b->language_model_state())->contained_in_row;
1813 
1814  // Generate pain points for ambiguities found by NoDangerousAmbig().
1815  if (danger_end != NULL) {
1816  // Translate end index of an ambiguity to an end row.
1817  for (d = 0; d < danger_end[word_index].size(); ++d) {
1818  danger_end[word_index][d].end = pp_row;
1819  danger_begin[danger_end[word_index][d].begin].push_back(
1820  danger_end[word_index][d]);
1821  }
1822  // Generate a pain point to combine unichars in the "wrong" part
1823  // of the ambiguity.
1824  for (d = 0; d < danger_begin[word_index].size(); ++d) {
1825  if (language_model_debug_level > 0) {
1826  tprintf("Generating pain point from %sambiguity\n",
1827  danger_begin[word_index][d].dangerous ? "dangerous " : "");
1828  }
1829  GeneratePainPoint(pp_col, danger_begin[word_index][d].end, false,
1830  danger_begin[word_index][d].dangerous ?
1831  kCriticalPainPointPriorityAdjustment :
1832  kBestChoicePainPointPriorityAdjustment,
1833  best_choice_bundle->best_choice->certainty(), true,
1834  best_choice_bundle->best_choice->certainty(),
1835  max_char_wh_ratio_,
1836  curr_vse->parent_b, curr_vse->parent_vse,
1837  chunks_record, pain_points);
1838  }
1839  }
1840 
1841  if (!language_model_ngram_on) { // no need to use further heuristics if we
1842  // are guided by the character ngram model
1843  // Generate pain points for problematic sub-paths.
1844  if (ProblematicPath(*curr_vse, curr_b->unichar_id(),
1845  pp_row+1 == chunks_record->ratings->dimension())) {
1846  if (language_model_debug_level > 0) {
1847  tprintf("Generating pain point from a problematic sub-path\n");
1848  }
1849  float worst_piece_cert;
1850  bool fragment;
1851  if (pp_col > 0) {
1852  GetWorstPieceCertainty(pp_col-1, pp_row, chunks_record->ratings,
1853  &worst_piece_cert, &fragment);
1854  GeneratePainPoint(pp_col-1, pp_row, false,
1855  kDefaultPainPointPriorityAdjustment,
1856  worst_piece_cert, true,
1857  best_choice_bundle->best_choice->certainty(),
1858  max_char_wh_ratio_, curr_vse->parent_b, curr_vse->parent_vse,
1859  chunks_record, pain_points);
1860  }
1861  if (pp_row+1 < chunks_record->ratings->dimension()) {
1862  GetWorstPieceCertainty(pp_col, pp_row+1, chunks_record->ratings,
1863  &worst_piece_cert, &fragment);
1864  GeneratePainPoint(pp_col, pp_row+1, true,
1865  kDefaultPainPointPriorityAdjustment,
1866  worst_piece_cert, true,
1867  best_choice_bundle->best_choice->certainty(),
1868  max_char_wh_ratio_, curr_vse->parent_b, curr_vse->parent_vse,
1869  chunks_record, pain_points);
1870  }
1871  }
1872 
1873  // Generate a pain point if we encountered a punctuation/number streak to
1874  // combine all punctuation marks into one blob and try to classify it.
1875  bool is_alpha = dict_->getUnicharset().get_isalpha(curr_b->unichar_id());
1876  if (!is_alpha) {
1877  if (punc_streak_end_row == -1) punc_streak_end_row = pp_row;
1878  punc_streak_length++;
1879  if (curr_b->certainty() < punc_streak_min_cert)
1880  punc_streak_min_cert = curr_b->certainty();
1881  }
1882  if (is_alpha || curr_vse->parent_vse == NULL) {
1883  if (punc_streak_end_row != -1 && punc_streak_length > 1) {
1884  if (language_model_debug_level > 0) {
1885  tprintf("Generating pain point from a punctuation streak\n");
1886  }
1887  if (is_alpha ||
1888  (curr_vse->parent_vse == NULL && punc_streak_length > 2)) {
1889  GeneratePainPoint(pp_row+1, punc_streak_end_row, false,
1890  kDefaultPainPointPriorityAdjustment,
1891  punc_streak_min_cert, true,
1892  best_choice_bundle->best_choice->certainty(),
1893  max_char_wh_ratio_, curr_b, curr_vse,
1894  chunks_record, pain_points);
1895  }
1896  // Handle punctuation/number streaks starting from the first unichar.
1897  if (curr_vse->parent_vse == NULL) {
1898  GeneratePainPoint(0, punc_streak_end_row, false,
1899  kDefaultPainPointPriorityAdjustment,
1900  punc_streak_min_cert, true,
1901  best_choice_bundle->best_choice->certainty(),
1902  max_char_wh_ratio_, curr_b, curr_vse,
1903  chunks_record, pain_points);
1904  }
1905  }
1906  punc_streak_end_row = -1;
1907  punc_streak_length = 0;
1908  punc_streak_min_cert = 0.0f;
1909  } // end handling punctuation streaks
1910  }
1911 
1912  curr_b = curr_vse->parent_b;
1913  curr_vse = curr_vse->parent_vse;
1914  } // end looking at best choice subpaths
1915 
1916  // Clean up.
1917  if (danger_end != NULL) {
1918  delete[] danger_begin;
1919  delete[] danger_end;
1920  }
1921 }
1922 
1923 bool LanguageModel::GeneratePainPoint(
1924  int col, int row, bool ok_to_extend, float priority,
1925  float worst_piece_cert, bool fragmented, float best_choice_cert,
1926  float max_char_wh_ratio,
1927  BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse,
1928  CHUNKS_RECORD *chunks_record, HEAP *pain_points) {
1929  if (col < 0 || row >= chunks_record->ratings->dimension() ||
1930  chunks_record->ratings->get(col, row) != NOT_CLASSIFIED) {
1931  return false;
1932  }
1933  if (language_model_debug_level > 3) {
1934  tprintf("\nGenerating pain point for col=%d row=%d priority=%g parent=",
1935  col, row, priority);
1936  if (parent_vse != NULL) {
1937  PrintViterbiStateEntry("", parent_vse, parent_b, chunks_record);
1938  } else {
1939  tprintf("NULL");
1940  }
1941  tprintf("\n");
1942  }
1943 
1944  AssociateStats associate_stats;
1945  ComputeAssociateStats(col, row, max_char_wh_ratio, parent_vse,
1946  chunks_record, &associate_stats);
1947  // For fixed-pitch fonts/languages: if the current combined blob overlaps
1948  // the next blob on the right and it is ok to extend the blob, try expanding
1949  // the blob until there is no overlap with the next blob on the right or
1950  // until the width-to-height ratio becomes too large.
1951  if (ok_to_extend) {
1952  while (associate_stats.bad_fixed_pitch_right_gap &&
1953  row+1 < chunks_record->ratings->dimension() &&
1954  !associate_stats.bad_fixed_pitch_wh_ratio) {
1955  ComputeAssociateStats(col, ++row, max_char_wh_ratio, parent_vse,
1956  chunks_record, &associate_stats);
1957  }
1958  }
1959 
1960  if (associate_stats.bad_shape) {
1961  if (language_model_debug_level > 3) {
1962  tprintf("Discarded pain point with a bad shape\n");
1963  }
1964  return false;
1965  }
1966 
1967  // Compute pain point priority.
1968  if (associate_stats.shape_cost > 0) {
1969  priority *= associate_stats.shape_cost;
1970  }
1971  if (worst_piece_cert < best_choice_cert) {
1972  worst_piece_cert = best_choice_cert;
1973  }
1974  priority *= CertaintyScore(worst_piece_cert);
1975  if (fragmented) priority /= kDefaultPainPointPriorityAdjustment;
1976 
1977  if (language_model_debug_level > 3) {
1978  tprintf("worst_piece_cert=%g fragmented=%d\n",
1979  worst_piece_cert, fragmented);
1980  }
1981 
1982  if (parent_vse != NULL) {
1983  priority *= sqrt(parent_vse->cost / static_cast<float>(col));
1984  if (parent_vse->dawg_info != NULL) {
1985  priority /= kDefaultPainPointPriorityAdjustment;
1986  if (parent_vse->length > language_model_min_compound_length) {
1987  priority /= sqrt(static_cast<double>(parent_vse->length));
1988  }
1989  }
1990  }
1991 
1992  MATRIX_COORD *pain_point = new MATRIX_COORD(col, row);
1993  if (HeapPushCheckSize(pain_points, priority, pain_point)) {
1994  if (language_model_debug_level) {
1995  tprintf("Added pain point with priority %g\n", priority);
1996  }
1997  return true;
1998  } else {
1999  delete pain_point;
2000  if (language_model_debug_level) tprintf("Pain points heap is full\n");
2001  return false;
2002  }
2003 }
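// Priority arithmetic above with hypothetical numbers: starting from
// priority 1.0 with shape_cost 2.0 and worst_piece_cert -3.0, priority
// becomes 2.0 * CertaintyScore(-3.0); a fragmented pain point is then
// divided by kDefaultPainPointPriorityAdjustment, and a parent path through
// a dawg is divided again. Assuming the pain points HEAP is min-ordered
// (smaller keys popped first), these divisions make such pain points be
// examined earlier.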
2004 
2005 } // namespace tesseract