Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::LanguageModel Class Reference

#include <language_model.h>

List of all members.

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 ~LanguageModel ()
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
void CleanUp ()
void DeleteState (BLOB_CHOICE_LIST *choices)
LanguageModelFlagsType UpdateState (LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void GenerateNgramModelPainPointsFromColumn (int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record)
void GenerateProblematicPathPainPointsFromColumn (int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void GeneratePainPointsFromColumn (int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void GeneratePainPointsFromBestChoice (HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
bool GeneratePainPoint (int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
bool AcceptableChoiceFound ()
void GetWorstPieceCertainty (int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
float ComputeOutlineLength (BLOB_CHOICE *b)

Public Attributes

int language_model_debug_level = 0
bool language_model_ngram_on = false
int language_model_ngram_order = 8
int language_model_viterbi_list_max_num_prunable = 10
int language_model_viterbi_list_max_size = 500
double language_model_ngram_small_prob = 0.000001
double language_model_ngram_nonmatch_score = -40.0
bool language_model_ngram_use_only_first_uft8_step = false
double language_model_ngram_scale_factor = 0.03
bool language_model_ngram_space_delimited_language = true
int language_model_min_compound_length = 3
int language_model_fixed_length_choices_depth = 3
double language_model_penalty_non_freq_dict_word = 0.1
double language_model_penalty_non_dict_word = 0.15
double language_model_penalty_punc = 0.2
double language_model_penalty_case = 0.1
double language_model_penalty_script = 0.5
double language_model_penalty_chartype = 0.3
double language_model_penalty_font = 0.00
double language_model_penalty_spacing = 0.05
double language_model_penalty_increment = 0.01
bool language_model_use_sigmoidal_certainty = false

Static Public Attributes

static const float kInitialPainPointPriorityAdjustment = 5.0f
static const float kDefaultPainPointPriorityAdjustment = 2.0f
static const float kBestChoicePainPointPriorityAdjustment = 0.5f
static const float kCriticalPainPointPriorityAdjustment = 0.1f
static const float kMaxAvgNgramCost = 25.0f
static const int kMinFixedLengthDawgLength = 2
static const float kLooseMaxCharWhRatio = 2.5f
static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
static const LanguageModelFlagsType kConsistentFlag = 0x8
static const LanguageModelFlagsType kDawgFlag = 0x10
static const LanguageModelFlagsType kNgramFlag = 0x20
static const LanguageModelFlagsType kJustClassifiedFlag = 0x80
static const LanguageModelFlagsType kAllChangedFlag = 0xff

Protected Member Functions

float CertaintyScore (float cert)
bool NonAlphaOrDigitMiddle (int col, int row, int dimension, UNICHAR_ID unichar_id)
bool IsFragment (BLOB_CHOICE *b)
bool IsHan (int script_id)
void GetPieceCertainty (BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented)
float ComputeAdjustment (int num_problems, float penalty)
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
float ComputeConsistencyAdjustedRatingsSum (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
float ComputeAdjustedPathCost (float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
bool ProblematicPath (const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
void GetTopChoiceLowerUpper (LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper)
LanguageModelFlagsType AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void PrintViterbiStateEntry (const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)
void GenerateTopChoiceInfo (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed)
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed)
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed)
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info)
void UpdateBestChoice (BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void ExtractRawFeaturesFromPath (const ViterbiStateEntry &vse, float *features)
WERD_CHOICE * ConstructWord (BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
void UpdateCoveredByFixedLengthDawgs (const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done)
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
bool PrunablePath (LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
bool AcceptablePath (const ViterbiStateEntry &vse)

Protected Attributes

DawgArgs * dawg_args_
GenericVector< bool * > updated_flags_
float rating_cert_scale_
const UnicityTable< FontInfo > * fontinfo_table_
Dict * dict_
bool fixed_pitch_
float max_char_wh_ratio_
STRING prev_word_str_
int prev_word_unichar_step_len_
DawgInfoVector * beginning_active_dawgs_
DawgInfoVector * beginning_constraints_
DawgInfoVector * fixed_length_beginning_active_dawgs_
DawgInfoVector * empty_dawg_info_vec_
float max_penalty_adjust_
bool acceptable_choice_found_
bool correct_segmentation_explored_

Detailed Description

Definition at line 264 of file language_model.h.


Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 43 of file language_model.cpp.

: INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
dict->getImage()->getCCUtil()->params()),
"Turn on/off the use of character ngram model",
dict->getImage()->getCCUtil()->params()),
"Maximum order of the character ngram model",
dict->getImage()->getCCUtil()->params()),
"Maximum number of prunable (those for which"
" PrunablePath() is true) entries in each viterbi list"
" recorded in BLOB_CHOICEs",
dict->getImage()->getCCUtil()->params()),
"Maximum size of viterbi lists recorded in BLOB_CHOICEs",
dict->getImage()->getCCUtil()->params()),
"To avoid overly small denominators use this as the "
"floor of the probability returned by the ngram model.",
dict->getImage()->getCCUtil()->params()),
"Average classifier score of a non-matching unichar.",
dict->getImage()->getCCUtil()->params()),
"Use only the first UTF8 step of the given string"
" when computing log probabilities.",
dict->getImage()->getCCUtil()->params()),
"Strength of the character ngram model relative to the"
" character classifier ",
dict->getImage()->getCCUtil()->params()),
"Words are delimited by space",
dict->getImage()->getCCUtil()->params()),
"Minimum length of compound words",
dict->getImage()->getCCUtil()->params()),
"Depth of blob choice lists to explore"
" when fixed length dawgs are on",
dict->getImage()->getCCUtil()->params()),
"Penalty for words not in the frequent word dictionary",
dict->getImage()->getCCUtil()->params()),
"Penalty for non-dictionary words",
dict->getImage()->getCCUtil()->params()),
"Penalty for inconsistent punctuation",
dict->getImage()->getCCUtil()->params()),
"Penalty for inconsistent case",
dict->getImage()->getCCUtil()->params()),
"Penalty for inconsistent script",
dict->getImage()->getCCUtil()->params()),
"Penalty for inconsistent character type",
dict->getImage()->getCCUtil()->params()),
// TODO(daria, rays): enable font consistency checking
// after improving font analysis.
"Penalty for inconsistent font",
dict->getImage()->getCCUtil()->params()),
"Penalty for inconsistent spacing",
dict->getImage()->getCCUtil()->params()),
"Penalty increment",
dict->getImage()->getCCUtil()->params()),
"Use sigmoidal score for certainty",
dict->getImage()->getCCUtil()->params()),
fontinfo_table_(fontinfo_table), dict_(dict),
dawg_args_ = new DawgArgs(NULL, NULL, new DawgInfoVector(),
new DawgInfoVector(),
0.0, NO_PERM, kAnyWordLength, -1);
beginning_active_dawgs_ = new DawgInfoVector();
beginning_constraints_ = new DawgInfoVector();
fixed_length_beginning_active_dawgs_ = new DawgInfoVector();
empty_dawg_info_vec_ = new DawgInfoVector();
}
tesseract::LanguageModel::~LanguageModel ( )

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 407 of file language_model.h.

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 731 of file language_model.h.

{
return (vse.dawg_info != NULL || vse.Consistent() ||
(vse.ngram_info != NULL && !vse.ngram_info->pruned));
}
LanguageModelFlagsType tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
BLOB_CHOICE *  parent_b,
ViterbiStateEntry *  parent_vse,
HEAP *  pain_points,
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 450 of file language_model.cpp.

{
ViterbiStateEntry_IT vit;
tprintf("\nAddViterbiStateEntry for unichar %s rating=%.4f"
" certainty=%.4f top_choice_flags=0x%x parent_vse=%p\n",
b->rating(), b->certainty(), top_choice_flags, parent_vse);
tprintf("Existing viterbi list:\n");
vit.set_to_list(&(reinterpret_cast<LanguageModelState *>(
b->language_model_state())->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
PrintViterbiStateEntry("", vit.data(), b, chunks_record);
}
}
}
// Check whether the list is full.
if (b->language_model_state() != NULL &&
(reinterpret_cast<LanguageModelState *>(
b->language_model_state())->viterbi_state_entries_length >=
tprintf("AddViterbiStateEntry: viterbi list is full!\n");
}
return 0x0;
}
LanguageModelFlagsType changed = 0x0;
float optimistic_cost = 0.0f;
if (!language_model_ngram_on) optimistic_cost += b->rating();
if (parent_vse != NULL) optimistic_cost += parent_vse->cost;
// Discard this entry if it will not beat best choice.
if (optimistic_cost >= best_choice_bundle->best_choice->rating()) {
tprintf("Discarded ViterbiEntry with high cost %.4f"
" best_choice->rating()=%.4f\n", optimistic_cost,
best_choice_bundle->best_choice->rating());
}
return 0x0;
}
// Check consistency of the path and set the relevant consistency_info.
LanguageModelConsistencyInfo consistency_info;
FillConsistencyInfo(curr_col, word_end, b, parent_vse, parent_b,
chunks_record, &consistency_info);
// Invoke Dawg language model component.
LanguageModelDawgInfo *dawg_info =
GenerateDawgInfo(word_end, consistency_info.script_id,
curr_col, curr_row, *b, parent_vse, &changed);
// Invoke TopChoice language model component
float ratings_sum = b->rating();
if (parent_vse != NULL) ratings_sum += parent_vse->ratings_sum;
GenerateTopChoiceInfo(ratings_sum, dawg_info, consistency_info,
parent_vse, b, &top_choice_flags, &changed);
// Invoke Ngram language model component.
LanguageModelNgramInfo *ngram_info = NULL;
ngram_info = GenerateNgramInfo(
denom, curr_col, curr_row, parent_vse, parent_b, &changed);
ASSERT_HOST(ngram_info != NULL);
}
// Prune non-top choice paths with inconsistent scripts.
if (consistency_info.inconsistent_script) {
if (!(top_choice_flags & kSmallestRatingFlag)) changed = 0x0;
if (ngram_info != NULL) ngram_info->pruned = true;
}
// If language model components did not like this unichar - return
if (!changed) {
tprintf("Language model components did not like this entry\n");
}
delete dawg_info;
delete ngram_info;
return 0x0;
}
// Compute cost of associating the blobs that represent the current unichar.
AssociateStats associate_stats;
parent_vse, chunks_record, &associate_stats);
if (parent_vse != NULL) {
associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
}
// Compute the aggregate cost (adjusted ratings sum).
ratings_sum,
(parent_vse == NULL) ? 1 : (parent_vse->length+1),
(dawg_info == NULL) ? 0.0f : 1.0f,
dawg_info, ngram_info, consistency_info, associate_stats, parent_vse);
if (b->language_model_state() == NULL) {
b->set_language_model_state(new LanguageModelState(curr_col, curr_row));
}
LanguageModelState *lms =
reinterpret_cast<LanguageModelState *>(b->language_model_state());
// Discard this entry if it represents a prunable path and
// language_model_viterbi_list_max_num_prunable such entries with a lower
// cost have already been recorded.
if (PrunablePath(top_choice_flags, dawg_info) &&
(lms->viterbi_state_entries_prunable_length >=
cost >= lms->viterbi_state_entries_prunable_max_cost) {
tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
cost, lms->viterbi_state_entries_prunable_max_cost);
}
delete dawg_info;
delete ngram_info;
return 0x0;
}
// Create the new ViterbiStateEntry and add it to lms->viterbi_state_entries
ViterbiStateEntry *new_vse = new ViterbiStateEntry(
parent_b, parent_vse, b, cost, ComputeOutlineLength(b), consistency_info,
associate_stats, top_choice_flags, dawg_info, ngram_info);
updated_flags_.push_back(&(new_vse->updated));
lms->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
false, new_vse);
lms->viterbi_state_entries_length++;
if (PrunablePath(top_choice_flags, dawg_info)) {
lms->viterbi_state_entries_prunable_length++;
}
// Update lms->viterbi_state_entries_prunable_max_cost and clear
// top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
if ((lms->viterbi_state_entries_prunable_length >=
ASSERT_HOST(!lms->viterbi_state_entries.empty());
vit.set_to_list(&(lms->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
ViterbiStateEntry *curr_vse = vit.data();
// Clear the appropriate top choice flags of the entries in the
// list that have ratings_sum higher than new_entry->ratings_sum
// (since they will not be top choices any more).
if (curr_vse->top_choice_flags && curr_vse != new_vse &&
curr_vse->ratings_sum, curr_vse->dawg_info,
curr_vse->consistency_info) >
new_vse->ratings_sum, new_vse->dawg_info,
new_vse->consistency_info)) {
curr_vse->top_choice_flags &= ~(top_choice_flags);
}
if (prunable_counter > 0 &&
PrunablePath(curr_vse->top_choice_flags, curr_vse->dawg_info)) {
--prunable_counter;
}
// Update lms->viterbi_state_entries_prunable_max_cost.
if (prunable_counter == 0) {
lms->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
tprintf("Set viterbi_state_entries_prunable_max_cost to %.4f\n",
lms->viterbi_state_entries_prunable_max_cost);
}
prunable_counter = -1; // stop counting
}
}
}
// Print the newly created ViterbiStateEntry.
PrintViterbiStateEntry("New", new_vse, b, chunks_record);
tprintf("Updated viterbi list (max cost %g):\n",
lms->viterbi_state_entries_prunable_max_cost);
vit.set_to_list(&(lms->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
PrintViterbiStateEntry("", vit.data(), b, chunks_record);
}
}
}
// Update best choice if needed.
if (word_end) {
UpdateBestChoice(b, new_vse, pain_points, chunks_record,
best_choice_bundle, blamer_bundle);
}
// Update stats in best_path_by_column.
if (new_vse->Consistent() || new_vse->dawg_info != NULL ||
(new_vse->ngram_info != NULL && !new_vse->ngram_info->pruned)) {
float avg_cost = new_vse->cost / static_cast<float>(curr_row+1);
for (int c = curr_col; c <= curr_row; ++c) {
if (avg_cost < (*best_path_by_column)[c].avg_cost) {
(*best_path_by_column)[c].avg_cost = avg_cost;
(*best_path_by_column)[c].best_vse = new_vse;
(*best_path_by_column)[c].best_b = b;
tprintf("Set best_path_by_column[%d]=(%g %p)\n",
c, avg_cost, new_vse);
}
}
}
}
return changed;
}
float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 440 of file language_model.h.

{
// cert is assumed to be between 0 and -dict_->certainty_scale.
// If you enable language_model_use_sigmoidal_certainty, you
// need to adjust language_model_ngram_nonmatch_score as well.
cert = -cert / dict_->certainty_scale;
return 1.0f / (1.0f + exp(10.0f * cert));
} else {
return (-1.0f / cert);
}
}
void tesseract::LanguageModel::CleanUp ( )

Definition at line 273 of file language_model.cpp.

{
for (int i = 0; i < updated_flags_.size(); ++i) {
*(updated_flags_[i]) = false;
}
}
float tesseract::LanguageModel::ComputeAdjustedPathCost ( float  ratings_sum,
int  length,
float  dawg_score,
const LanguageModelDawgInfo *  dawg_info,
const LanguageModelNgramInfo *  ngram_info,
const LanguageModelConsistencyInfo &  consistency_info,
const AssociateStats &  associate_stats,
ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 1152 of file language_model.cpp.

{
float adjustment = 1.0f;
if (dawg_info == NULL || dawg_info->permuter != FREQ_DAWG_PERM) {
}
if (dawg_score == 0.0f) {
adjustment += ((length - language_model_min_compound_length) *
}
} else if (dawg_score < 1.0f) {
adjustment += (1.0f - dawg_score) * language_model_penalty_non_dict_word;
}
if (associate_stats.shape_cost > 0) {
adjustment += associate_stats.shape_cost / static_cast<float>(length);
}
ASSERT_HOST(ngram_info != NULL);
return ngram_info->ngram_cost * adjustment;
} else {
adjustment += ComputeConsistencyAdjustment(dawg_info, consistency_info);
return ratings_sum * adjustment;
}
}
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 485 of file language_model.h.

{
if (num_problems == 0) return 0.0f;
if (num_problems == 1) return penalty;
return (penalty + (language_model_penalty_increment *
static_cast<float>(num_problems-1)));
}
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
CHUNKS_RECORD *  chunks_record,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 699 of file language_model.h.

{
col, row,
(parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
(parent_vse != NULL) ? parent_vse->length : 0,
fixed_pitch_, max_char_wh_ratio,
chunks_record->word_res != NULL ? &chunks_record->word_res->denorm : NULL,
chunks_record, language_model_debug_level, associate_stats);
}
float tesseract::LanguageModel::ComputeConsistencyAdjustedRatingsSum ( float  ratings_sum,
const LanguageModelDawgInfo *  dawg_info,
const LanguageModelConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 518 of file language_model.h.

{
return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
dawg_info, consistency_info)));
}
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LanguageModelConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 496 of file language_model.h.

{
if (dawg_info != NULL) {
return ComputeAdjustment(consistency_info.NumInconsistentCase(),
}
return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
ComputeAdjustment(consistency_info.NumInconsistentCase(),
ComputeAdjustment(consistency_info.NumInconsistentChartype(),
ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
(consistency_info.inconsistent_script ?
(consistency_info.inconsistent_font ?
}
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 962 of file language_model.cpp.

{
if (curr_list->empty()) return 1.0f;
float denom = 0.0f;
int len = 0;
BLOB_CHOICE_IT c_it(curr_list);
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
ASSERT_HOST(c_it.data() != NULL);
++len;
denom += CertaintyScore(c_it.data()->certainty());
}
assert(len != 0);
// The ideal situation would be to have the classifier scores for
// classifying each position as each of the characters in the unicharset.
// Since we can not do this because of speed, we add a very crude estimate
// of what these scores for the "missing" classifications would sum up to.
denom += (dict_->getUnicharset().size() - len) *
return denom;
}
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 905 of file language_model.cpp.

{
const char *context_ptr = context;
char *modified_context = NULL;
char *modified_context_end = NULL;
const char *unichar_ptr = unichar;
const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
float prob = 0.0f;
int step = 0;
while (unichar_ptr < unichar_end &&
(step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
}
prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
++(*unichar_step_len);
unichar_ptr += step;
// If there are multiple UTF8 characters present in unichar, context is
// updated to include the previously examined characters from str,
// unless use_only_first_uft8_step is true.
if (unichar_ptr < unichar_end) {
if (modified_context == NULL) {
int context_len = strlen(context);
modified_context =
new char[context_len + strlen(unichar_ptr) + step + 1];
strncpy(modified_context, context, context_len);
modified_context_end = modified_context + context_len;
context_ptr = modified_context;
}
strncpy(modified_context_end, unichar_ptr - step, step);
modified_context_end += step;
*modified_context_end = '\0';
}
}
prob /= static_cast<float>(*unichar_step_len); // normalize
if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
*found_small_prob = true;
}
*ngram_prob = -1.0*log(prob);
float cost = -1.0*log(CertaintyScore(certainty)/denom) +
tprintf("-log [ p(%s) * p(%s | %s) ] = -log(%g*%g) = %g\n", unichar,
unichar, context_ptr, CertaintyScore(certainty)/denom, prob, cost);
}
if (modified_context != NULL) delete[] modified_context;
return cost;
}
float tesseract::LanguageModel::ComputeOutlineLength ( BLOB_CHOICE *  b)
inline

Definition at line 434 of file language_model.h.

{
return rating_cert_scale_ * b->rating() / b->certainty();
}
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( BLOB_CHOICE *  b,
ViterbiStateEntry *  vse,
CHUNKS_RECORD *  chunks_record,
BLOB_CHOICE_LIST_VECTOR *  best_char_choices,
float  certainties[],
float *  dawg_score,
STATE *  state,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1375 of file language_model.cpp.

{
uinT64 state_uint = 0x0;
if (truth_path != NULL) {
*truth_path =
(blamer_bundle != NULL &&
!blamer_bundle->correct_segmentation_cols.empty() &&
vse->length == blamer_bundle->correct_segmentation_cols.length());
}
const uinT64 kHighestBitOn = 0x8000000000000000LL;
BLOB_CHOICE *curr_b = b;
LanguageModelState *curr_lms =
reinterpret_cast<LanguageModelState *>(b->language_model_state());
ViterbiStateEntry *curr_vse = vse;
int i;
bool compound = dict_->hyphenated(); // treat hyphenated words as compound
bool dawg_score_done = true;
if (dawg_score != NULL) {
*dawg_score = 0.0f;
// For space-delimited languages the presence of dawg_info in the last
// ViterbiStateEntry on the path means that the whole path represents
// a valid dictionary word.
if (vse->dawg_info != NULL) *dawg_score = 1.0f;
} else if (vse->length == 1) {
*dawg_score = 1.0f; // each one-letter word is legal
dawg_score_done = true; // in non-space delimited languages
} else {
dawg_score_done = false; // do more work to compute dawg_score
}
}
// For non-space delimited languages we compute the fraction of letters
// "covered" by fixed length dawgs (i.e. words of length > 1 on the path).
int covered_by_fixed_length_dawgs = 0;
// The number of unichars remaining that should be skipped because
// they are covered by the previous word from fixed length dawgs.
int fixed_length_num_unichars_to_skip = 0;
// Re-compute the variance of the width-to-height ratios (since we now
// can compute the mean over the whole word).
float full_wh_ratio_mean = 0.0f;
if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
static_cast<float>(vse->length));
vse->associate_stats.full_wh_ratio_var = 0.0f;
}
// Construct a WERD_CHOICE by tracing parent pointers.
WERD_CHOICE *word = new WERD_CHOICE(chunks_record->word_res->uch_set,
vse->length);
word->set_length(vse->length);
for (i = (vse->length-1); i >= 0; --i) {
if (blamer_bundle != NULL && truth_path != NULL && *truth_path) {
if ((blamer_bundle->correct_segmentation_cols[i] !=
curr_lms->contained_in_col) ||
(blamer_bundle->correct_segmentation_rows[i] !=
curr_lms->contained_in_row)) {
*truth_path = false;
}
}
word->set_unichar_id(curr_b->unichar_id(), i);
word->set_fragment_length(1, i);
if (certainties != NULL) certainties[i] = curr_b->certainty();
if (best_char_choices != NULL) {
best_char_choices->set(chunks_record->ratings->get(
curr_lms->contained_in_col, curr_lms->contained_in_row), i);
}
if (state != NULL) {
// Record row minus col zeroes in the reverse state to mark the number
// of joins done by using a blob from this cell in the ratings matrix.
state_uint >>= (curr_lms->contained_in_row - curr_lms->contained_in_col);
// Record a one in the reverse state to indicate the split before
// the blob from the next cell in the ratings matrix (unless we are
// at the first cell, in which case there is no next blob).
if (i != 0) {
state_uint >>= 1;
state_uint |= kHighestBitOn;
}
}
// For non-space delimited languages: find word endings recorded while
// trying to separate the current path into words (for words found in
// fixed length dawgs).
if (!dawg_score_done && curr_vse->dawg_info != NULL) {
UpdateCoveredByFixedLengthDawgs(*(curr_vse->dawg_info->active_dawgs),
i, vse->length,
&fixed_length_num_unichars_to_skip,
&covered_by_fixed_length_dawgs,
dawg_score, &dawg_score_done);
}
// Update the width-to-height ratio variance. Useful for non-space delimited
// languages to ensure that the blobs are of uniform width.
// Skip leading and trailing punctuation when computing the variance.
if ((full_wh_ratio_mean != 0.0f &&
((curr_vse != vse && curr_vse->parent_vse != NULL) ||
vse->associate_stats.full_wh_ratio_var +=
pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
tprintf("full_wh_ratio_var += (%g-%g)^2\n",
full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
}
}
// Mark the word as compound if compound permuter was set for any of
// the unichars on the path (usually this will happen for unichars
// that are compounding operators, like "-" and "/").
if (!compound && curr_vse->dawg_info &&
curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
// Update curr_* pointers.
if (curr_vse->parent_b == NULL) break;
curr_b = curr_vse->parent_b;
curr_lms =
reinterpret_cast<LanguageModelState *>(curr_b->language_model_state());
curr_vse = curr_vse->parent_vse;
}
ASSERT_HOST(i == 0); // check that we recorded all the unichar ids
// Re-adjust shape cost to include the updated width-to-height variance.
if (full_wh_ratio_mean != 0.0f) {
vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
}
if (state != NULL) {
state_uint >>= (64 - (chunks_record->ratings->dimension()-1));
state->part2 = state_uint;
state_uint >>= 32;
state->part1 = state_uint;
}
word->set_rating(vse->ratings_sum);
word->set_certainty(vse->min_certainty);
if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
} else if (language_model_ngram_on && !vse->ngram_info->pruned) {
word->set_permuter(NGRAM_PERM);
} else if (vse->top_choice_flags) {
word->set_permuter(TOP_CHOICE_PERM);
} else {
word->set_permuter(NO_PERM);
}
return word;
}
void tesseract::LanguageModel::DeleteState ( BLOB_CHOICE_LIST *  choices)

Definition at line 280 of file language_model.cpp.

{
BLOB_CHOICE_IT b_it(choices);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
if (b_it.data()->language_model_state() != NULL) {
LanguageModelState *state = reinterpret_cast<LanguageModelState *>(
b_it.data()->language_model_state());
delete state;
b_it.data()->set_language_model_state(NULL);
}
}
}
void tesseract::LanguageModel::ExtractRawFeaturesFromPath ( const ViterbiStateEntry &  vse,
float *  features 
)
protected

Definition at line 1330 of file language_model.cpp.

{
for (int f = 0; f < PTRAIN_NUM_RAW_FEATURE_TYPES; ++f) features[f] = 0.0;
// Dictionary-related features.
if (vse.dawg_info != NULL) {
features[PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE] = vse.dawg_info->permuter;
// Mark as unambiguous if unambig_dawg is found among active dawgs.
for (int d = 0; d < vse.dawg_info->active_dawgs->size(); ++d) {
if (dict_->GetDawg(vse.dawg_info->active_dawgs->get(d).dawg_index) ==
break;
}
}
}
if (vse.associate_stats.shape_cost > 0) {
features[PTRAIN_RAW_FEATURE_SHAPE_COST] = vse.associate_stats.shape_cost;
}
ASSERT_HOST(vse.ngram_info != NULL);
features[PTRAIN_RAW_FEATURE_NGRAM_PROB] = vse.ngram_info->ngram_prob;
}
// Consistency-related features.
vse.consistency_info.NumInconsistentPunc();
vse.consistency_info.NumInconsistentCase();
vse.consistency_info.NumInconsistentChartype();
vse.consistency_info.NumInconsistentSpaces();
vse.consistency_info.inconsistent_script;
vse.consistency_info.inconsistent_font;
// Classifier-related features.
features[PTRAIN_RAW_FEATURE_WORST_CERT] = vse.min_certainty;
features[PTRAIN_RAW_FEATURE_RATING] = vse.ratings_sum;
features[PTRAIN_RAW_FEATURE_ADAPTED] = vse.adapted;
// Normalization-related features.
features[PTRAIN_RAW_FEATURE_NUM_UNICHARS] = vse.length;
features[PTRAIN_RAW_FEATURE_OUTLINE_LEN] = vse.outline_length;
}
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
BLOB_CHOICE parent_b,
CHUNKS_RECORD chunks_record,
LanguageModelConsistencyInfo consistency_info 
)
protected

Definition at line 983 of file language_model.cpp.

// Updates consistency_info (punctuation validity, letter case, script,
// character type, font and spacing counters) for the path extended by
// appending blob choice b to the path ending at parent_vse/parent_b.
{
const UNICHARSET &unicharset = dict_->getUnicharset();
UNICHAR_ID unichar_id = b->unichar_id();
// Start from the parent's accumulated consistency state, if any.
if (parent_vse != NULL) *consistency_info = parent_vse->consistency_info;
// Check punctuation validity.
if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
(unicharset.get_isalpha(parent_b->unichar_id()) ||
unicharset.get_isdigit(parent_b->unichar_id()))) {
// Reset punc_ref for compound words.
consistency_info->punc_ref = NO_EDGE;
} else {
// NOTE(review): the two arms of this ternary, the condition of the if
// below, and the declaration of 'node' were lost during extraction --
// the punctuation-pattern walk cannot be read from this listing alone.
UNICHAR_ID pattern_unichar_id =
(unicharset.get_isalpha(unichar_id) ||
unicharset.get_isdigit(unichar_id)) ?
if (consistency_info->punc_ref == NO_EDGE ||
pattern_unichar_id != Dawg::kPatternUnicharID ||
dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
consistency_info->punc_ref);
consistency_info->punc_ref =
(node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
node, pattern_unichar_id, word_end) : NO_EDGE;
if (consistency_info->punc_ref == NO_EDGE) {
consistency_info->invalid_punc = true;
}
}
}
}
// Update case related counters.
if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
// Reset counters if we are dealing with a compound word.
consistency_info->num_lower = 0;
consistency_info->num_non_first_upper = 0;
}
else if (unicharset.get_islower(unichar_id)) {
consistency_info->num_lower++;
} else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
// An upper-case char counts as "non-first upper" when the parent was
// upper-case or lower/upper chars were already seen on the path.
if (unicharset.get_isupper(parent_b->unichar_id()) ||
consistency_info->num_lower > 0 ||
consistency_info->num_non_first_upper > 0) {
consistency_info->num_non_first_upper++;
}
}
// Initialize consistency_info->script_id (use script of unichar_id
// if it is not Common, use script id recorded by the parent otherwise).
// Set inconsistent_script to true if the script of the current unichar
// is not consistent with that of the parent.
consistency_info->script_id = unicharset.get_script(unichar_id);
// Hiragana and Katakana can mix with Han.
// NOTE(review): an enclosing condition for this mapping appears to be
// missing (unmatched brace below).
if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
consistency_info->script_id == unicharset.hiragana_sid()) ||
(unicharset.katakana_sid() != unicharset.null_sid() &&
consistency_info->script_id == unicharset.katakana_sid())) {
consistency_info->script_id = dict_->getUnicharset().han_sid();
}
}
// NOTE(review): the rest of this condition was lost during extraction.
if (parent_vse != NULL &&
(parent_vse->consistency_info.script_id !=
int parent_script_id = parent_vse->consistency_info.script_id;
// If script_id is Common, use script id of the parent instead.
if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
consistency_info->script_id = parent_script_id;
}
if (consistency_info->script_id != parent_script_id) {
consistency_info->inconsistent_script = true;
}
}
// Update chartype related counters.
if (unicharset.get_isalpha(unichar_id)) {
consistency_info->num_alphas++;
} else if (unicharset.get_isdigit(unichar_id)) {
consistency_info->num_digits++;
} else if (!unicharset.get_ispunctuation(unichar_id)) {
consistency_info->num_other++;
}
// Check font and spacing consistency.
if (parent_b != NULL) {
// Find a font id shared by parent and current blob choices, if any.
int fontinfo_id = -1;
if (parent_b->fontinfo_id() == b->fontinfo_id() ||
parent_b->fontinfo_id2() == b->fontinfo_id()) {
fontinfo_id = b->fontinfo_id();
} else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
parent_b->fontinfo_id2() == b->fontinfo_id2()) {
fontinfo_id = b->fontinfo_id2();
}
// NOTE(review): this tprintf is presumably wrapped in a debug-level
// guard that was dropped by the extraction (note the stray '}' after it).
tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
(parent_b->fontinfo_id() >= 0) ?
fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
(parent_b->fontinfo_id2() >= 0) ?
fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
(b->fontinfo_id() >= 0) ?
fontinfo_table_->get(b->fontinfo_id()).name : "",
(fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
(fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
fontinfo_id);
}
bool expected_gap_found = false;
float expected_gap;
int temp_gap;
if (fontinfo_id >= 0) { // found a common font
if (fontinfo_table_->get(fontinfo_id).get_spacing(
parent_b->unichar_id(), unichar_id, &temp_gap)) {
expected_gap = temp_gap;
expected_gap_found = true;
}
} else {
consistency_info->inconsistent_font = true;
// Get an average of the expected gaps in each font.
int num_addends = 0;
expected_gap = 0;
int temp_fid;
// Try all four parent/current font-id combinations.
for (int i = 0; i < 4; ++i) {
if (i == 0) {
temp_fid = parent_b->fontinfo_id();
} else if (i == 1) {
temp_fid = parent_b->fontinfo_id2();
} else if (i == 2) {
temp_fid = b->fontinfo_id();
} else {
temp_fid = b->fontinfo_id2();
}
if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
parent_b->unichar_id(), unichar_id, &temp_gap)) {
expected_gap += temp_gap;
num_addends++;
}
}
expected_gap_found = (num_addends > 0);
if (num_addends > 0) expected_gap /= static_cast<float>(num_addends);
}
if (expected_gap_found) {
float actual_gap =
static_cast<float>(AssociateUtils::GetChunksGap(
chunks_record->chunk_widths, curr_col-1));
float gap_ratio = expected_gap / actual_gap;
// TODO(daria): find a good way to tune this heuristic estimate.
// NOTE(review): 1/2 is integer division and evaluates to 0, so the
// lower bound never triggers; 0.5 was most likely intended.
if (gap_ratio < 1/2 || gap_ratio > 2) {
consistency_info->num_inconsistent_spaces++;
}
tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
unicharset.id_to_unichar(parent_b->unichar_id()),
parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
unichar_id, curr_col, expected_gap, actual_gap);
}
}
}
}
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  script_id,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse,
LanguageModelFlagsType changed 
)
protected

Definition at line 740 of file language_model.cpp.

// Constructs the dawg bookkeeping for extending the path ending at
// parent_vse with blob choice b: handles hyphenated words, compound words
// and fixed-length (non-space-delimited) dawgs, setting kDawgFlag in
// *changed when a dict path survives. Returns NULL if no dict path does.
{
bool use_fixed_length_dawgs = !fixed_length_beginning_active_dawgs_->empty();
// Initialize active_dawgs and constraints from parent_vse if it is not NULL,
// otherwise use beginning_active_dawgs_ and beginning_constraints_.
if (parent_vse == NULL ||
(use_fixed_length_dawgs && parent_vse->dawg_info == NULL)) {
// NOTE(review): the assignments of active_dawgs/constraints for the
// path-start case appear to have been dropped by the extraction.
dawg_args_->permuter = NO_PERM;
} else {
if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
dawg_args_->constraints = parent_vse->dawg_info->constraints;
dawg_args_->permuter = parent_vse->dawg_info->permuter;
}
// Deal with hyphenated words.
if (!use_fixed_length_dawgs && word_end &&
dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
*changed |= kDawgFlag;
// NOTE(review): the constraints argument of this constructor call seems
// to be missing from the listing.
return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
COMPOUND_PERM);
}
// Deal with compound words.
if (!use_fixed_length_dawgs && dict_->compound_marker(b.unichar_id()) &&
(parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
if (language_model_debug_level > 0) tprintf("Found compound marker");
// Do not allow compound operators at the beginning and end of the word.
// Do not allow more than one compound operator per word.
// Do not allow compounding of words with lengths shorter than
// language_model_min_compound_length
if (parent_vse == NULL || word_end ||
dawg_args_->permuter == COMPOUND_PERM ||
parent_vse->length < language_model_min_compound_length) return NULL;
int i;
// Check that the path terminated before the current character is a word.
bool has_word_ending = false;
for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
const DawgInfo &info = (*parent_vse->dawg_info->active_dawgs)[i];
const Dawg *pdawg = dict_->GetDawg(info.dawg_index);
assert(pdawg != NULL);
if (pdawg->type() == DAWG_TYPE_WORD && info.ref != NO_EDGE &&
pdawg->end_of_word(info.ref)) {
has_word_ending = true;
break;
}
}
if (!has_word_ending) return NULL;
// Return LanguageModelDawgInfo with active_dawgs set to
// the beginning dawgs of type DAWG_TYPE_WORD.
if (language_model_debug_level > 0) tprintf("Compound word found\n");
DawgInfoVector beginning_word_dawgs;
for (i = 0; i < beginning_active_dawgs_->size(); ++i) {
// NOTE(review): the initializer of bdawg was truncated here.
const Dawg *bdawg =
if (bdawg->type() == DAWG_TYPE_WORD) {
beginning_word_dawgs += (*beginning_active_dawgs_)[i];
}
}
*changed |= kDawgFlag;
// NOTE(review): beginning_word_dawgs is a local; the constructor is
// presumably copying it (and a constraints argument appears missing).
return new LanguageModelDawgInfo(&(beginning_word_dawgs),
COMPOUND_PERM);
} // done dealing with compound words
LanguageModelDawgInfo *dawg_info = NULL;
// Call LetterIsOkay().
// NOTE(review): the actual LetterIsOkay() invocation referenced by the
// comment above is not present in this listing.
if (dawg_args_->permuter != NO_PERM) {
*changed |= kDawgFlag;
// NOTE(review): this constructor call is truncated (missing arguments
// and terminator).
dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_active_dawgs,
}
// For non-space delimited languages: since every letter could be
// a valid word, a new word could start at every unichar. Thus append
// fixed_length_beginning_active_dawgs_ to dawg_info->active_dawgs.
if (use_fixed_length_dawgs) {
if (dawg_info == NULL) {
*changed |= kDawgFlag;
dawg_info = new LanguageModelDawgInfo(
empty_dawg_info_vec_, SYSTEM_DAWG_PERM);
} else {
*(dawg_info->active_dawgs) += *(fixed_length_beginning_active_dawgs_);
}
} // done dealing with fixed-length dawgs
return dawg_info;
}
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
const ViterbiStateEntry parent_vse,
BLOB_CHOICE parent_b,
LanguageModelFlagsType changed 
)
protected

Definition at line 844 of file language_model.cpp.

// Computes the character-ngram cost/probability of appending unichar to the
// path ending at parent_vse and returns a new LanguageModelNgramInfo with
// the updated context. Sets kNgramFlag in *changed unless the path is pruned.
{
// Initialize parent context: previous word for a path start, otherwise the
// context accumulated along the parent path.
const char *pcontext_ptr = "";
int pcontext_unichar_step_len = 0;
if (parent_vse == NULL) {
pcontext_ptr = prev_word_str_.string();
pcontext_unichar_step_len = prev_word_unichar_step_len_;
} else {
pcontext_ptr = parent_vse->ngram_info->context.string();
pcontext_unichar_step_len =
parent_vse->ngram_info->context_unichar_step_len;
}
// Compute p(unichar | parent context).
int unichar_step_len = 0;
bool pruned = false;
float ngram_prob;
float ngram_cost = ComputeNgramCost(unichar, certainty, denom,
pcontext_ptr, &unichar_step_len,
&pruned, &ngram_prob);
// First attempt to normalize ngram_cost for strings of different
// lengths - we multiply ngram_cost by P(char | context) as many times
// as the number of chunks occupied by char. This makes the ngram costs
// of all the paths ending at the current BLOB_CHOICE comparable.
// TODO(daria): it would be worth looking at different ways of normalization.
if (curr_row > curr_col) {
ngram_cost += (curr_row - curr_col) * ngram_cost;
ngram_prob += (curr_row - curr_col) * ngram_prob;
}
// Add the ngram_cost of the parent.
if (parent_vse != NULL) {
ngram_cost += parent_vse->ngram_info->ngram_cost;
ngram_prob += parent_vse->ngram_info->ngram_prob;
}
// Shorten parent context string by unichar_step_len unichars.
// NOTE(review): the right-hand side of this expression was truncated by
// the extraction (presumably "- language_model_ngram_order").
int num_remove = (unichar_step_len + pcontext_unichar_step_len -
if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
while (num_remove > 0 && *pcontext_ptr != '\0') {
pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
--num_remove;
}
// Decide whether to prune this ngram path and update changed accordingly.
if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
if (!pruned) *changed |= kNgramFlag;
// Construct and return the new LanguageModelNgramInfo.
LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_prob, ngram_cost);
ngram_info->context += unichar;
ngram_info->context_unichar_step_len += unichar_step_len;
assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
return ngram_info;
}
void tesseract::LanguageModel::GenerateNgramModelPainPointsFromColumn ( int  col,
int  row,
HEAP pain_points,
CHUNKS_RECORD chunks_record 
)

Definition at line 1604 of file language_model.cpp.

// Inspects the first top-choice path recorded for ratings cell (col, row)
// and, when that path was pruned by the ngram model, pushes pain points
// that re-segment around the point where the pruning began.
{
// Find the first top choice path recorded for this cell.
// If this path is pruned - generate a pain point.
ASSERT_HOST(chunks_record->ratings->get(col, row) != NULL);
BLOB_CHOICE_IT bit(chunks_record->ratings->get(col, row));
bool fragmented = false;
for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
if (dict_->getUnicharset().get_fragment(bit.data()->unichar_id())) {
fragmented = true;
continue;
}
LanguageModelState *lms = reinterpret_cast<LanguageModelState *>(
bit.data()->language_model_state());
if (lms == NULL) continue;
ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
const ViterbiStateEntry *vse = vit.data();
if (!vse->top_choice_flags) continue;
ASSERT_HOST(vse->ngram_info != NULL);
// Only act where pruning started: this entry is pruned but its parent
// was not.
if (vse->ngram_info->pruned && (vse->parent_vse == NULL ||
!vse->parent_vse->ngram_info->pruned)) {
if (vse->parent_vse != NULL) {
LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
vse->parent_b->language_model_state());
// NOTE(review): this call appears to be missing arguments relative
// to the GeneratePainPoint signature (priority adjustment and
// max_char_wh_ratio, presumably).
GeneratePainPoint(pp_lms->contained_in_col, row, false,
-1.0f, fragmented, -1.0f,
vse->parent_vse->parent_b,
vse->parent_vse->parent_vse,
chunks_record, pain_points);
}
// NOTE(review): part of this condition (between the second line and
// the unichar_id() test) was dropped during extraction.
if (vse->parent_vse != NULL &&
vse->parent_vse->parent_vse != NULL &&
vse->parent_b->unichar_id())) {
// If the dip in the ngram probability is due to punctuation in the
// middle of the word - go two unichars back to combine this
// punctuation mark with the previous character on the path.
LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
vse->parent_vse->parent_b->language_model_state());
GeneratePainPoint(pp_lms->contained_in_col, col-1, false,
-1.0f, fragmented, -1.0f,
vse->parent_vse->parent_vse->parent_b,
vse->parent_vse->parent_vse->parent_vse,
chunks_record, pain_points);
} else if (row+1 < chunks_record->ratings->dimension()) {
GeneratePainPoint(col, row+1, true,
-1.0f, fragmented, -1.0f,
vse->parent_b,
vse->parent_vse,
chunks_record, pain_points);
}
}
return; // examined the lowest cost top choice path
}
}
}
bool tesseract::LanguageModel::GeneratePainPoint ( int  col,
int  row,
bool  ok_to_extend,
float  priority_adjustment,
float  worst_piece_cert,
bool  fragmented,
float  best_choice_cert,
float  max_char_wh_ratio,
BLOB_CHOICE parent_b,
ViterbiStateEntry parent_vse,
CHUNKS_RECORD chunks_record,
HEAP pain_points 
)

Definition at line 1923 of file language_model.cpp.

// Attempts to push a pain point (an unclassified matrix cell to be
// classified later) for cell (col, row) onto the pain_points heap, computing
// its priority from shape cost, certainties and the parent path cost.
// Returns true if the pain point was queued.
{
// Reject cells outside the matrix or already classified.
if (col < 0 || row >= chunks_record->ratings->dimension() ||
chunks_record->ratings->get(col, row) != NOT_CLASSIFIED) {
return false;
}
// NOTE(review): a debug-level guard and the initialization of the local
// 'priority' (presumably from priority_adjustment) appear to have been
// dropped by the extraction -- 'priority' is used below without a visible
// declaration, and there is an unmatched '}' after the debug output.
tprintf("\nGenerating pain point for col=%d row=%d priority=%g parent=",
col, row, priority);
if (parent_vse != NULL) {
PrintViterbiStateEntry("", parent_vse, parent_b, chunks_record);
} else {
tprintf("NULL");
}
tprintf("\n");
}
AssociateStats associate_stats;
ComputeAssociateStats(col, row, max_char_wh_ratio, parent_vse,
chunks_record, &associate_stats);
// For fixed-pitch fonts/languages: if the current combined blob overlaps
// the next blob on the right and it is ok to extend the blob, try expending
// the blob until there is no overlap with the next blob on the right or
// until the width-to-height ratio becomes too large.
if (ok_to_extend) {
while (associate_stats.bad_fixed_pitch_right_gap &&
row+1 < chunks_record->ratings->dimension() &&
!associate_stats.bad_fixed_pitch_wh_ratio) {
ComputeAssociateStats(col, ++row, max_char_wh_ratio, parent_vse,
chunks_record, &associate_stats);
}
}
// NOTE(review): a debug guard around this tprintf appears missing; the
// stray '}' before 'return false' suggests dropped lines.
if (associate_stats.bad_shape) {
tprintf("Discarded pain point with a bad shape\n");
}
return false;
}
// Compute pain point priority.
if (associate_stats.shape_cost > 0) {
priority *= associate_stats.shape_cost;
}
if (worst_piece_cert < best_choice_cert) {
worst_piece_cert = best_choice_cert;
}
priority *= CertaintyScore(worst_piece_cert);
if (fragmented) priority /= kDefaultPainPointPriorityAdjustment;
tprintf("worst_piece_cert=%g fragmented=%d\n",
worst_piece_cert, fragmented);
}
if (parent_vse != NULL) {
priority *= sqrt(parent_vse->cost / static_cast<float>(col));
if (parent_vse->dawg_info != NULL) {
// Discount the priority of the path corresponding to a dictionary word.
if (parent_vse->length > language_model_min_compound_length) {
priority /= sqrt(static_cast<double>(parent_vse->length));
}
}
}
// Queue the pain point; the heap takes ownership on success.
MATRIX_COORD *pain_point = new MATRIX_COORD(col, row);
if (HeapPushCheckSize(pain_points, priority, pain_point)) {
tprintf("Added pain point with priority %g\n", priority);
}
return true;
} else {
// Heap is at capacity: discard the coordinate we just allocated.
delete pain_point;
if (language_model_debug_level) tprintf("Pain points heap is full\n");
return false;
}
}
void tesseract::LanguageModel::GeneratePainPointsFromBestChoice ( HEAP pain_points,
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle 
)

Definition at line 1752 of file language_model.cpp.

// Walks the best-choice Viterbi path backwards and queues pain points for:
// (a) ambiguities recorded by NoDangerousAmbig(), (b) problematic sub-paths
// (when the ngram model is off), and (c) punctuation/number streaks.
{
// Variables to backtrack best_vse path;
ViterbiStateEntry *curr_vse = best_choice_bundle->best_vse;
BLOB_CHOICE *curr_b = best_choice_bundle->best_b;
// Begins and ends in DANGERR vector record the positions in the blob choice
// list of the best choice. We need to translate these endpoints into the
// beginning column and ending row for the pain points. We maintain
// danger_begin and danger_end arrays indexed by the position in
// best_choice_bundle->best_char_choices (which is equal to the position
// on the best_choice_bundle->best_vse path).
// danger_end[d] stores the DANGERR_INFO structs with end==d and is
// initialized at the beginning of this function.
// danger_begin[d] stores the DANGERR_INFO struct with begin==d and
// has end set to the row of the end of this ambiguity.
// The translation from end in terms of the best choice index to the end row
// is done while following the parents of best_choice_bundle->best_vse.
assert(best_choice_bundle->best_char_choices->length() ==
best_choice_bundle->best_vse->length);
DANGERR *danger_begin = NULL;
DANGERR *danger_end = NULL;
int d;
if (!best_choice_bundle->fixpt.empty()) {
danger_begin = new DANGERR[best_choice_bundle->best_vse->length];
danger_end = new DANGERR[best_choice_bundle->best_vse->length];
for (d = 0; d < best_choice_bundle->fixpt.size(); ++d) {
const DANGERR_INFO &danger = best_choice_bundle->fixpt[d];
// Only use n->1 ambiguities.
// NOTE(review): the tail of this condition was dropped during
// extraction (and an unmatched '}' follows the push_back).
if (danger.end > danger.begin && !danger.correct_is_ngram &&
danger_end[danger.end].push_back(danger);
}
}
}
// Variables to keep track of punctuation/number streaks.
int punc_streak_end_row = -1;
int punc_streak_length = 0;
float punc_streak_min_cert = 0.0f;
// NOTE(review): the debug guards around several tprintf calls in this
// function appear to have been dropped (note the stray closing braces).
tprintf("\nGenerating pain points for best path=%p\n", curr_vse);
}
int word_index = best_choice_bundle->best_vse->length;
// Walk the path from the last unichar to the first.
while (curr_vse != NULL) {
word_index--;
ASSERT_HOST(word_index >= 0);
ASSERT_HOST(curr_b != NULL);
tprintf("Looking at unichar %s\n",
}
// Column/row of the matrix cell the current blob choice came from.
int pp_col = reinterpret_cast<LanguageModelState *>(
curr_b->language_model_state())->contained_in_col;
int pp_row = reinterpret_cast<LanguageModelState *>(
curr_b->language_model_state())->contained_in_row;
// Generate pain points for ambiguities found by NoDangerousAmbig().
if (danger_end != NULL) {
// Translate end index of an ambiguity to an end row.
for (d = 0; d < danger_end[word_index].size(); ++d) {
danger_end[word_index][d].end = pp_row;
danger_begin[danger_end[word_index][d].begin].push_back(
danger_end[word_index][d]);
}
// Generate a pain point to combine unchars in the "wrong" part
// of the ambiguity.
for (d = 0; d < danger_begin[word_index].size(); ++d) {
tprintf("Generating pain point from %sambiguity\n",
danger_begin[word_index][d].dangerous ? "dangerous " : "");
}
// NOTE(review): one argument of this ternary appears truncated.
GeneratePainPoint(pp_col, danger_begin[word_index][d].end, false,
danger_begin[word_index][d].dangerous ?
best_choice_bundle->best_choice->certainty(), true,
best_choice_bundle->best_choice->certainty(),
curr_vse->parent_b, curr_vse->parent_vse,
chunks_record, pain_points);
}
}
if (!language_model_ngram_on) { // no need to use further heuristics if we
// are guided by the character ngram model
// Generate pain points for problematic sub-paths.
if (ProblematicPath(*curr_vse, curr_b->unichar_id(),
pp_row+1 == chunks_record->ratings->dimension())) {
tprintf("Generating pain point from a problematic sub-path\n");
}
float worst_piece_cert;
bool fragment;
if (pp_col > 0) {
GetWorstPieceCertainty(pp_col-1, pp_row, chunks_record->ratings,
&worst_piece_cert, &fragment);
// NOTE(review): arguments appear to be missing from this call
// relative to the GeneratePainPoint signature.
GeneratePainPoint(pp_col-1, pp_row, false,
worst_piece_cert, true,
best_choice_bundle->best_choice->certainty(),
chunks_record, pain_points);
}
if (pp_row+1 < chunks_record->ratings->dimension()) {
GetWorstPieceCertainty(pp_col, pp_row+1, chunks_record->ratings,
&worst_piece_cert, &fragment);
GeneratePainPoint(pp_col, pp_row+1, true,
worst_piece_cert, true,
best_choice_bundle->best_choice->certainty(),
chunks_record, pain_points);
}
}
// Generate a pain point if we encountered a punctuation/number streak to
// combine all punctuation marks into one blob and try to classify it.
bool is_alpha = dict_->getUnicharset().get_isalpha(curr_b->unichar_id());
if (!is_alpha) {
if (punc_streak_end_row == -1) punc_streak_end_row = pp_row;
punc_streak_length++;
if (curr_b->certainty() < punc_streak_min_cert)
punc_streak_min_cert = curr_b->certainty();
}
if (is_alpha || curr_vse->parent_vse == NULL) {
if (punc_streak_end_row != -1 && punc_streak_length > 1) {
tprintf("Generating pain point from a punctuation streak\n");
}
if (is_alpha ||
(curr_vse->parent_vse == NULL && punc_streak_length > 2)) {
GeneratePainPoint(pp_row+1, punc_streak_end_row, false,
punc_streak_min_cert, true,
best_choice_bundle->best_choice->certainty(),
max_char_wh_ratio_, curr_b, curr_vse,
chunks_record, pain_points);
}
// Handle punctuation/number streaks starting from the first unichar.
if (curr_vse->parent_vse == NULL) {
GeneratePainPoint(0, punc_streak_end_row, false,
punc_streak_min_cert, true,
best_choice_bundle->best_choice->certainty(),
chunks_record, pain_points);
}
}
punc_streak_end_row = -1;
punc_streak_length = 0;
punc_streak_min_cert = 0.0f;
} // end handling punctuation streaks
}
// Step to the parent entry on the best path.
curr_b = curr_vse->parent_b;
curr_vse = curr_vse->parent_vse;
} // end looking at best choice subpaths
// Clean up.
if (danger_end != NULL) {
delete[] danger_begin;
delete[] danger_end;
}
}
void tesseract::LanguageModel::GeneratePainPointsFromColumn ( int  col,
const GenericVector< int > &  non_empty_rows,
float  best_choice_cert,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record 
)

Definition at line 1581 of file language_model.cpp.

// Dispatches pain-point generation for each non-empty row of the given
// column, choosing between the ngram-model and problematic-path strategies.
{
for (int i = 0; i < non_empty_rows.length(); ++i) {
int row = non_empty_rows[i];
tprintf("\nLooking for pain points in col=%d row=%d\n", col, row);
}
// NOTE(review): the enclosing if (presumably on language_model_ngram_on)
// and the two callee names (GenerateNgramModelPainPointsFromColumn /
// GenerateProblematicPathPainPointsFromColumn, judging by the argument
// lists) were dropped during extraction -- only their arguments remain.
col, row, pain_points, chunks_record);
} else {
col, row, best_choice_cert, pain_points,
best_path_by_column, chunks_record);
}
}
}
void tesseract::LanguageModel::GenerateProblematicPathPainPointsFromColumn ( int  col,
int  row,
float  best_choice_cert,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record 
)

Definition at line 1668 of file language_model.cpp.

// Examines the best path through ratings cell (col, row); if the path is
// unpromising, not prolonged by the recorded best parent, or problematic at
// its last unichar, queues pain points joining the cell with its neighbors.
{
MATRIX *ratings = chunks_record->ratings;
// Get the best path from this matrix cell.
BLOB_CHOICE_LIST *blist = ratings->get(col, row);
ASSERT_HOST(blist != NULL);
if (blist->empty()) return;
BLOB_CHOICE_IT bit(blist);
bool fragment = false;
while (dict_->getUnicharset().get_fragment(bit.data()->unichar_id()) &&
!bit.at_last()) { // skip fragments
fragment = true;
bit.forward();
}
if (bit.data()->language_model_state() == NULL) return;
ViterbiStateEntry_IT vit(&(reinterpret_cast<LanguageModelState *>(
bit.data()->language_model_state())->viterbi_state_entries));
if (vit.empty()) return;
ViterbiStateEntry *vse = vit.data();
// Check whether this path is promising.
bool path_is_promising = true;
// NOTE(review): the scoping here is garbled -- potential_avg_cost is used
// below the brace that appears to close its declaring block, and a debug
// guard around the tprintf seems to be missing.
if (vse->parent_vse != NULL) {
float potential_avg_cost =
((vse->parent_vse->cost + bit.data()->rating()*0.5f) /
static_cast<float>(row+1));
tprintf("potential_avg_cost %g best cost %g\n",
potential_avg_cost, (*best_path_by_column)[col].avg_cost);
}
if (potential_avg_cost >= (*best_path_by_column)[col].avg_cost) {
path_is_promising = false;
}
}
// Set best_parent_vse to the best parent recorded in best_path_by_column.
ViterbiStateEntry *best_parent_vse = vse->parent_vse;
BLOB_CHOICE *best_parent_b = vse->parent_b;
if (col > 0 && (*best_path_by_column)[col-1].best_vse != NULL) {
ASSERT_HOST((*best_path_by_column)[col-1].best_b != NULL);
LanguageModelState *best_lms = reinterpret_cast<LanguageModelState *>(
((*best_path_by_column)[col-1].best_b)->language_model_state());
if (best_lms->contained_in_row == col-1) {
best_parent_vse = (*best_path_by_column)[col-1].best_vse;
best_parent_b = (*best_path_by_column)[col-1].best_b;
tprintf("Setting best_parent_vse to %p\n", best_parent_vse);
}
}
}
// Check whether this entry terminates the best parent path
// recorded in best_path_by_column.
bool best_not_prolonged = (best_parent_vse != vse->parent_vse);
// If this path is problematic because of the last unichar - generate
// a pain point to combine it with its left and right neighbor.
BLOB_CHOICE_IT tmp_bit;
if (best_not_prolonged ||
(path_is_promising &&
ProblematicPath(*vse, bit.data()->unichar_id(),
row+1 == ratings->dimension()))) {
float worst_piece_cert;
bool fragmented;
if (col-1 > 0) {
GetWorstPieceCertainty(col-1, row, chunks_record->ratings,
&worst_piece_cert, &fragmented);
GeneratePainPoint(col-1, row, false,
worst_piece_cert, fragmented, best_choice_cert,
max_char_wh_ratio_, best_parent_b, best_parent_vse,
chunks_record, pain_points);
}
if (row+1 < ratings->dimension()) {
GetWorstPieceCertainty(col, row+1, chunks_record->ratings,
&worst_piece_cert, &fragmented);
// NOTE(review): the call line (GeneratePainPoint(col, row+1, ...)
// presumably) was dropped here; only its trailing arguments remain.
worst_piece_cert, fragmented, best_choice_cert,
max_char_wh_ratio_, best_parent_b, best_parent_vse,
chunks_record, pain_points);
}
} // for ProblematicPath
}
void tesseract::LanguageModel::GenerateTopChoiceInfo ( float  ratings_sum,
const LanguageModelDawgInfo dawg_info,
const LanguageModelConsistencyInfo consistency_info,
const ViterbiStateEntry parent_vse,
BLOB_CHOICE b,
LanguageModelFlagsType top_choice_flags,
LanguageModelFlagsType changed 
)
protected

Definition at line 705 of file language_model.cpp.

// Refines *top_choice_flags for the path being constructed: inherits the
// parent's flags, adds the consistency flag, and clears any flag already
// claimed by a cheaper entry in b's viterbi_state_entries list. The
// surviving flags are OR-ed into *changed.
{
// NOTE(review): the call these arguments belong to (and, below, the if
// condition wrapping vit.data()->...) was dropped during extraction.
ratings_sum, dawg_info, consistency_info);
// Clear flags that do not agree with parent_vse->top_choice_flags.
if (parent_vse != NULL) *top_choice_flags &= parent_vse->top_choice_flags;
if (consistency_info.Consistent()) *top_choice_flags |= kConsistentFlag;
if (*top_choice_flags == 0x0) return;
LanguageModelState *lms =
reinterpret_cast<LanguageModelState *>(b->language_model_state());
if (lms != NULL && !lms->viterbi_state_entries.empty()) {
ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
vit.data()->ratings_sum, vit.data()->dawg_info,
vit.data()->consistency_info)) {
// Clear the appropriate flags if the list already contains
// a top choice entry with a lower cost.
*top_choice_flags &= ~(vit.data()->top_choice_flags);
}
}
}
// NOTE(review): a debug-level guard around this tprintf appears missing.
tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
*top_choice_flags);
}
*changed |= *top_choice_flags;
}
void tesseract::LanguageModel::GetPieceCertainty ( BLOB_CHOICE_LIST *  blist,
float *  cert,
bool *  fragmented 
)
inlineprotected

Definition at line 472 of file language_model.h.

{
  // Unclassified or empty cells contribute nothing to the certainty.
  if (blist == NOT_CLASSIFIED || blist->empty()) return;
  BLOB_CHOICE_IT choice_it(blist);
  // Advance past leading fragment choices, remembering that we saw one.
  for (; !choice_it.at_last() && IsFragment(choice_it.data());
       choice_it.forward()) {
    *fragmented = true;
  }
  // Each classification must have at least one non-fragmented choice.
  ASSERT_HOST(!IsFragment(choice_it.data()));
  // Keep the minimum (worst) certainty seen so far.
  float certainty = choice_it.data()->certainty();
  if (certainty < *cert) *cert = certainty;
}
void tesseract::LanguageModel::GetTopChoiceLowerUpper ( LanguageModelFlagsType  changed,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper 
)
protected

Definition at line 426 of file language_model.cpp.

{
  // Nothing to do unless the caller asked for case information.
  if (!(changed & (kLowerCaseFlag | kUpperCaseFlag))) return;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_nonfragment = NULL;
  BLOB_CHOICE_IT choice_it(curr_list);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    BLOB_CHOICE *choice = choice_it.data();
    UNICHAR_ID id = choice->unichar_id();
    if (unicharset.get_fragment(id)) continue;  // fragments carry no case
    if (first_nonfragment == NULL) first_nonfragment = choice;
    if (*first_lower == NULL && unicharset.get_islower(id)) {
      *first_lower = choice;
    }
    if (*first_upper == NULL && unicharset.get_isupper(id)) {
      *first_upper = choice;
    }
  }
  // The list must contain at least one non-fragment choice.
  ASSERT_HOST(first_nonfragment != NULL);
  // Fall back to the first non-fragment choice when no cased one was found.
  if (*first_lower == NULL) *first_lower = first_nonfragment;
  if (*first_upper == NULL) *first_upper = first_nonfragment;
}
void tesseract::LanguageModel::GetWorstPieceCertainty ( int  col,
int  row,
MATRIX ratings,
float *  cert,
bool *  fragmented 
)
inline

Definition at line 413 of file language_model.h.

{
  *cert = 0.0f;
  *fragmented = false;
  // Fold in the certainty of the cell below-left (col, row-1), if present.
  if (row > 0) {
    GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
  }
  // Fold in the certainty of the cell to the right (col+1, row), if present.
  if (col+1 < ratings->dimension()) {
    GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
  }
  // At least one neighbor must have contributed a (negative) certainty.
  ASSERT_HOST(*cert < 0.0f);
}
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  best_choice_cert,
float  max_char_wh_ratio,
float  rating_cert_scale,
HEAP pain_points,
CHUNKS_RECORD chunks_record,
BlamerBundle blamer_bundle,
bool  debug_blamer 
)

Definition at line 141 of file language_model.cpp.

{
fixed_pitch_ = fixed_pitch;
max_char_wh_ratio_ = max_char_wh_ratio;
rating_cert_scale_ = rating_cert_scale;
// For each cell, generate a "pain point" if the cell is not classified
// and has a left or right neighbor that was classified.
MATRIX *ratings = chunks_record->ratings;
for (int col = 0; col < ratings->dimension(); ++col) {
for (int row = col+1; row < ratings->dimension(); ++row) {
if ((row > 0 && ratings->get(col, row-1) != NOT_CLASSIFIED) ||
(col+1 < ratings->dimension() &&
ratings->get(col+1, row) != NOT_CLASSIFIED)) {
float worst_piece_cert;
bool fragmented;
GetWorstPieceCertainty(col, row, chunks_record->ratings,
&worst_piece_cert, &fragmented);
worst_piece_cert, fragmented, best_choice_cert,
chunks_record, pain_points);
}
}
}
// Initialize vectors with beginning DawgInfos.
for (int i = 0; i < beginning_active_dawgs_->size(); ++i) {
int dawg_index = (*beginning_active_dawgs_)[i].dawg_index;
if (dawg_index <= dict_->GetMaxFixedLengthDawgIndex() &&
dawg_index >= kMinFixedLengthDawgLength) {
*fixed_length_beginning_active_dawgs_ += (*beginning_active_dawgs_)[i];
}
}
}
// Fill prev_word_str_ with the last language_model_ngram_order
// unichars from prev_word.
if (prev_word != NULL && prev_word->unichar_string() != NULL) {
} else {
}
const char *str_ptr = prev_word_str_.string();
const char *str_end = str_ptr + prev_word_str_.length();
int step;
while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
str_ptr += step;
}
ASSERT_HOST(str_ptr == str_end);
}
// Initialize blamer-related information: map character boxes recorded in
// blamer_bundle->norm_truth_word to the corresponding i,j indices in the
// ratings matrix. We expect this step to succeed, since when running the
// chopper we checked that the correct chops are present.
if (blamer_bundle != NULL &&
blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
blamer_bundle->truth_has_char_boxes) {
STRING blamer_debug;
blamer_debug += "Blamer computing correct_segmentation_cols\n";
int curr_box_col = 0;
int next_box_col = 0;
TBLOB *blob = chunks_record->chunks;
inT16 next_box_x = (blob != NULL) ? blob->bounding_box().right() : 0;
for (int truth_idx = 0;
blob != NULL && truth_idx < blamer_bundle->norm_truth_word.length();
blob = blob->next) {
++next_box_col;
inT16 curr_box_x = next_box_x;
if (blob->next != NULL) next_box_x = blob->next->bounding_box().right();
inT16 truth_x = blamer_bundle->norm_truth_word.BlobBox(truth_idx).right();
blamer_debug.add_str_int("Box x coord vs. truth: ", curr_box_x);
blamer_debug.add_str_int(" ", truth_x);
blamer_debug += "\n";
if (curr_box_x > (truth_x + blamer_bundle->norm_box_tolerance)) {
break; // failed to find a matching box
} else if (curr_box_x >=
(truth_x - blamer_bundle->norm_box_tolerance) && // matched
(blob->next == NULL || // next box can't be included
next_box_x > truth_x + blamer_bundle->norm_box_tolerance)) {
blamer_bundle->correct_segmentation_cols.push_back(curr_box_col);
blamer_bundle->correct_segmentation_rows.push_back(next_box_col-1);
++truth_idx;
blamer_debug.add_str_int("col=", curr_box_col);
blamer_debug.add_str_int(" row=", next_box_col-1);
blamer_debug += "\n";
curr_box_col = next_box_col;
}
}
if (blob != NULL || // trailing blobs
blamer_bundle->correct_segmentation_cols.length() !=
blamer_bundle->norm_truth_word.length()) {
blamer_debug.add_str_int("Blamer failed to find correct segmentation"
" (tolerance=", blamer_bundle->norm_box_tolerance);
if (blob == NULL) blamer_debug += " blob == NULL";
blamer_debug += ")\n";
blamer_debug.add_str_int(
" path length ", blamer_bundle->correct_segmentation_cols.length());
blamer_debug.add_str_int(" vs. truth ",
blamer_bundle->norm_truth_word.length());
blamer_debug += "\n";
blamer_bundle->SetBlame(IRR_UNKNOWN, blamer_debug, NULL, debug_blamer);
blamer_bundle->correct_segmentation_cols.clear();
blamer_bundle->correct_segmentation_rows.clear();
}
} // end if (blamer_bundle != NULL)
// Start a new hypothesis list for this run of segmentation search.
if (blamer_bundle != NULL) {
}
}
bool tesseract::LanguageModel::IsFragment ( BLOB_CHOICE b)
inlineprotected

Definition at line 459 of file language_model.h.

bool tesseract::LanguageModel::IsHan ( int  script_id)
inlineprotected

Definition at line 463 of file language_model.h.

{
  // True iff the given script id is the unicharset's Han script.
  // NOTE(review): the extracted listing dropped part of this expression
  // (unbalanced parentheses); reconstructed as "the unicharset defines a
  // Han script and script_id is that script" -- confirm against
  // language_model.h.
  return ((dict_->getUnicharset().han_sid() !=
           dict_->getUnicharset().null_sid()) &&
          (script_id == dict_->getUnicharset().han_sid()));
}
bool tesseract::LanguageModel::NonAlphaOrDigitMiddle ( int  col,
int  row,
int  dimension,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 452 of file language_model.h.

{
  // A blob in the middle of the word (not the first column and not the
  // last row) whose unichar is neither alphabetic nor a digit.
  const bool in_middle = (col > 0 && row + 1 < dimension);
  const bool alpha_or_digit =
      dict_->getUnicharset().get_isalpha(unichar_id) ||
      dict_->getUnicharset().get_isdigit(unichar_id);
  return in_middle && !alpha_or_digit;
}
void tesseract::LanguageModel::PrintViterbiStateEntry ( const char *  msg,
ViterbiStateEntry vse,
BLOB_CHOICE b,
CHUNKS_RECORD chunks_record 
)
protected

Definition at line 669 of file language_model.cpp.

{
  // Dumps a one-line debug summary of *vse, prefixed by msg.
  tprintf("%s ViterbiStateEntry %p with ratings_sum=%.4f length=%d cost=%.4f",
          msg, vse, vse->ratings_sum, vse->length, vse->cost);
  if (vse->top_choice_flags) {
    tprintf(" top_choice_flags=0x%x", vse->top_choice_flags);
  }
  if (!vse->Consistent()) {
    tprintf(" inconsistent=(punc %d case %d chartype %d script %d)\n",
            vse->consistency_info.NumInconsistentPunc(),
            vse->consistency_info.NumInconsistentCase(),
            vse->consistency_info.NumInconsistentChartype(),
            vse->consistency_info.inconsistent_script);
  }
  if (vse->dawg_info) tprintf(" permuter=%d", vse->dawg_info->permuter);
  if (vse->ngram_info) {
    tprintf(" ngram_cost=%g context=%s ngram pruned=%d",
            vse->ngram_info->ngram_cost,
            vse->ngram_info->context.string(),
            vse->ngram_info->pruned);
  }
  if (vse->associate_stats.shape_cost > 0.0f) {
    tprintf(" shape_cost=%g", vse->associate_stats.shape_cost);
  }
  // NOTE(review): the extracted listing lost the guard line and the
  // trailing arguments of this ConstructWord() call; reconstructed with
  // NULL for all optional outputs -- confirm against language_model.cpp.
  if (language_model_debug_level > 3) {
    STRING wd_str;
    WERD_CHOICE *wd = ConstructWord(b, vse, chunks_record,
                                    NULL, NULL, NULL, NULL, NULL, NULL);
    wd->string_and_lengths(&wd_str, NULL);
    delete wd;
    tprintf(" str=%s", wd_str.string());
  }
  tprintf("\n");
}
bool tesseract::LanguageModel::ProblematicPath ( const ViterbiStateEntry vse,
UNICHAR_ID  unichar_id,
bool  word_end 
)
protected

Definition at line 381 of file language_model.cpp.

{
  // Returns true if the path ending at vse is "problematic", i.e. it just
  // turned bad relative to its (consistent/dictionary/likely) parent.
  // NOTE(review): the extracted listing dropped the debug-level guards,
  // leaving each branch as "tprintf; } return true;" (unconditionally
  // true, unbalanced braces); guards reconstructed.
  //
  // The path is problematic if it is inconsistent and has a parent that
  // is consistent (or a NULL parent).
  if (!vse.Consistent() && (vse.parent_vse == NULL ||
                            vse.parent_vse->Consistent())) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: inconsistent\n");
    }
    return true;
  }
  // The path is problematic if it does not represent a dictionary word,
  // while its parent does.
  if (vse.dawg_info == NULL &&
      (vse.parent_vse == NULL || vse.parent_vse->dawg_info != NULL)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: dict word terminated\n");
    }
    return true;
  }
  // The path is problematic if ngram info indicates that this path is
  // an unlikely sequence of characters, while its parent does not have
  // extreme dips in ngram probabilities.
  if (vse.ngram_info != NULL && vse.ngram_info->pruned &&
      (vse.parent_vse == NULL || !vse.parent_vse->ngram_info->pruned)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: small ngram prob\n");
    }
    return true;
  }
  // The path is problematic if there is a non-alpha character in the
  // middle of the path (unless it is a digit in the middle of a path
  // that represents a number).
  if ((vse.parent_vse != NULL) && !word_end &&  // is middle
      !(dict_->getUnicharset().get_isalpha(unichar_id) ||  // alpha
        (dict_->getUnicharset().get_isdigit(unichar_id) &&  // ok digit
         vse.dawg_info != NULL && vse.dawg_info->permuter == NUMBER_PERM))) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: non-alpha middle\n");
    }
    return true;
  }
  return false;
}
bool tesseract::LanguageModel::PrunablePath ( LanguageModelFlagsType  top_choice_flags,
const LanguageModelDawgInfo dawg_info 
)
inlineprotected

Definition at line 719 of file language_model.h.

{
  // An entry carrying any top-choice flag is never prunable.
  if (top_choice_flags) return false;
  // Words found in the system/user/frequent-word dawgs are kept, but only
  // when no fixed-length dawgs are loaded.
  if (dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
    switch (dawg_info->permuter) {
      case SYSTEM_DAWG_PERM:
      case USER_DAWG_PERM:
      case FREQ_DAWG_PERM:
        return false;
      default:
        break;
    }
  }
  // Everything else may be pruned.
  return true;
}
void tesseract::LanguageModel::UpdateBestChoice ( BLOB_CHOICE b,
ViterbiStateEntry vse,
HEAP pain_points,
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1184 of file language_model.cpp.

{
// NOTE(review): this listing appears damaged by extraction: several
// debug-level guard lines and call-prefix lines are missing (the stray
// "};" after the first tprintf, the bare argument lines ending in
// "vse->parent_vse);", the "vse->ngram_info->ngram_cost :" fragment, the
// empty hyphen if/else), leaving unbalanced braces. Code is left
// byte-identical; comments only.
int i;
// One (initially NULL) best-char-choice slot per character on the path.
BLOB_CHOICE_LIST_VECTOR temp_best_char_choices(vse->length);
for (i = 0; i < vse->length; ++i) {
temp_best_char_choices.push_back(NULL);
}
// Per-character certainties; released with delete[] at the bottom.
float *certainties = new float[vse->length];
STATE temp_state;
// The fraction of letters in the path that are "covered" by dawgs.
// For space delimited languages this number will be 0.0 for nonwords
// and 1.0 for dictionary words. For non-space delimited languages
// this number will be in the [0.0, 1.0] range.
float dawg_score;
bool truth_path;
WERD_CHOICE *word = ConstructWord(b, vse, chunks_record,
&temp_best_char_choices, certainties,
&dawg_score, &temp_state,
blamer_bundle, &truth_path);
ASSERT_HOST(word != NULL);
// Skip best-choice bookkeeping while the blamer is looking for blame.
bool not_blaming =
(blamer_bundle == NULL || !blamer_bundle->segsearch_is_looking_for_blame);
// Log new segmentation (for dict_->LogNewChoice()).
if (not_blaming) {
PIECES_STATE pieces_widths;
bin_to_pieces(&temp_state, chunks_record->ratings->dimension() - 1,
pieces_widths);
dict_->LogNewSegmentation(pieces_widths);
}
// NOTE(review): the guard (presumably a language_model_debug_level
// check) that opened the scope closed by the "};" below is missing.
STRING word_str;
word->string_and_lengths(&word_str, NULL);
tprintf("UpdateBestChoice() constructed word %s\n", word_str.string());
};
// Update raw_choice if needed.
if ((vse->top_choice_flags & kSmallestRatingFlag) &&
word->rating() < best_choice_bundle->raw_choice->rating() &&
not_blaming) {
dict_->LogNewChoice(1.0, certainties, true, word, temp_best_char_choices);
*(best_choice_bundle->raw_choice) = *word;
best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
}
// When working with non-space delimited languages we re-adjust the cost
// taking into account the final dawg_score and a more precise shape cost.
// While constructing the paths we assume that they are all dictionary words
// (since a single character would be a valid dictionary word). At the end
// we compute dawg_score which reflects how many characters on the path are
// "covered" by dictionary words of length > 1.
if (vse->associate_stats.full_wh_ratio_var != 0.0f ||
(dict_->GetMaxFixedLengthDawgIndex() >= 0 && dawg_score < 1.0f)) {
// NOTE(review): the line naming the cost-recomputation call whose
// arguments follow is missing -- confirm against language_model.cpp.
vse->ratings_sum, vse->length, dawg_score, vse->dawg_info,
vse->ngram_info, vse->consistency_info, vse->associate_stats,
vse->parent_vse);
tprintf("Updated vse cost to %g (dawg_score %g full_wh_ratio_var %g)\n",
vse->cost, dawg_score, vse->associate_stats.full_wh_ratio_var);
}
}
// Update best choice and best char choices if needed.
// TODO(daria): re-write AcceptableChoice() and NoDangerousAmbig()
// to fit better into the new segmentation search.
word->set_rating(vse->cost);
if (word->rating() < best_choice_bundle->best_choice->rating() &&
not_blaming) {
// NOTE(review): the call prefix (presumably dict_->LogNewChoice(...))
// for the arguments below is missing from the listing.
vse->ngram_info->ngram_cost :
vse->ratings_sum),
certainties, false, word, temp_best_char_choices);
// Since the rating of the word could have been modified by
// Dict::LogNewChoice() - check again.
if (word->rating() < best_choice_bundle->best_choice->rating()) {
bool modified_blobs; // not used
DANGERR fixpt;
if (dict_->AcceptableChoice(&temp_best_char_choices, word, &fixpt,
ASSOCIATOR_CALLER, &modified_blobs) &&
AcceptablePath(*vse)) {
}
// Update best_choice_bundle.
*(best_choice_bundle->best_choice) = *word;
best_choice_bundle->updated = true;
best_choice_bundle->best_char_choices->delete_data_pointers();
best_choice_bundle->best_char_choices->clear();
// Deep-copy the per-character choice lists into the bundle.
for (i = 0; i < temp_best_char_choices.size(); ++i) {
BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
cc_list->deep_copy(temp_best_char_choices[i], &BLOB_CHOICE::deep_copy);
best_choice_bundle->best_char_choices->push_back(cc_list);
}
best_choice_bundle->best_state->part2 = temp_state.part2;
best_choice_bundle->best_state->part1 = temp_state.part1;
tprintf("Updated best choice\n");
print_state("New state ", best_choice_bundle->best_state,
chunks_record->ratings->dimension()-1);
}
// Update hyphen state if we are dealing with a dictionary word.
if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
// NOTE(review): the hyphen set/reset calls inside this if/else were
// lost in extraction -- both branches are empty here.
if (dict_->has_hyphen_end(*word)) {
} else {
}
}
best_choice_bundle->best_vse = vse;
best_choice_bundle->best_b = b;
best_choice_bundle->fixpt = fixpt;
if (blamer_bundle != NULL) {
// NOTE(review): the assignment target for this expression is missing.
(vse->dawg_info != NULL &&
(vse->top_choice_flags));
}
}
}
if (blamer_bundle != NULL) {
// Record the current hypothesis in params_training_bundle.
// NOTE(review): the right-hand side of this reference binding is
// missing from the listing.
ParamsTrainingHypothesis &hyp =
word->string_and_lengths(&(hyp.str), NULL);
ExtractRawFeaturesFromPath(*vse, hyp.features);
if (truth_path && // update best truth path rating
word->rating() < blamer_bundle->best_correctly_segmented_rating) {
blamer_bundle->best_correctly_segmented_rating = word->rating();
}
}
// Clean up.
delete[] certainties;
delete word;
}
void tesseract::LanguageModel::UpdateCoveredByFixedLengthDawgs ( const DawgInfoVector active_dawgs,
int  word_index,
int  word_length,
int *  skip,
int *  covered,
float *  dawg_score,
bool *  dawg_score_done 
)
protected

Definition at line 1527 of file language_model.cpp.

{
  // Updates the amount of the word covered by fixed-length dawgs while the
  // path is walked one character at a time.
  // *skip: characters still covered by the last matched fixed-length word.
  // *covered: total characters covered so far.
  // *dawg_score: fraction of the word covered; set when scoring completes.
  // NOTE(review): the extracted listing dropped the debug guards (leaving
  // stray braces) and passed word_length to a format string with only two
  // specifiers; guards and the call reconstructed -- confirm against
  // language_model.cpp.
  if (language_model_debug_level > 3) {
    tprintf("UpdateCoveredByFixedLengthDawgs for index %d skip=%d\n",
            word_index, *skip);
  }
  if (*skip > 0) {
    --(*skip);
  } else {
    int best_index = -1;
    for (int d = 0; d < active_dawgs.size(); ++d) {
      int dawg_index = (active_dawgs[d]).dawg_index;
      if (dawg_index > dict_->GetMaxFixedLengthDawgIndex()) {
        // If active_dawgs of the last ViterbiStateEntry on the path
        // contain a non-fixed length dawg, this means that the whole
        // path represents a word from a non-fixed length word dawg.
        if (word_index == (word_length - 1)) {
          *dawg_score = 1.0f;
          *dawg_score_done = true;
          return;
        }
      } else if (dawg_index >= kMinFixedLengthDawgLength) {
        const Dawg *curr_dawg = dict_->GetDawg(dawg_index);
        ASSERT_HOST(curr_dawg != NULL);
        // Remember the longest fixed-length dawg word ending here.
        if ((active_dawgs[d]).ref != NO_EDGE &&
            curr_dawg->end_of_word((active_dawgs[d]).ref) &&
            dawg_index > best_index) {
          best_index = dawg_index;
        }
        if (language_model_debug_level > 3) {
          tprintf("dawg_index %d, ref %d, eow %d\n", dawg_index,
                  (active_dawgs[d]).ref,
                  ((active_dawgs[d]).ref != NO_EDGE &&
                   curr_dawg->end_of_word((active_dawgs[d]).ref)));
        }
      }
    }  // end for
    if (best_index != -1) {
      *skip = best_index - 1;
      *covered += best_index;
    }
  }  // end if/else skip
  // At the start of the word (index 0), finalize the coverage fraction.
  if (word_index == 0) {
    ASSERT_HOST(*covered <= word_length);
    *dawg_score = (static_cast<float>(*covered) /
                   static_cast<float>(word_length));
    *dawg_score_done = true;
  }
}
LanguageModelFlagsType tesseract::LanguageModel::UpdateState ( LanguageModelFlagsType  changed,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE_LIST *  parent_list,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

Definition at line 292 of file language_model.cpp.

{
// NOTE(review): this listing appears damaged by extraction: the guard
// opening the scope of the first tprintf is missing, the first for-loop
// body below is reduced to a bare "break;", and the PrunablePath
// condition has an unbalanced ")". Code left byte-identical; comments
// only.
tprintf("\nUpdateState: col=%d row=%d (changed=0x%x parent=%p)\n",
curr_col, curr_row, changed, parent_list);
}
// Initialize helper variables.
bool word_end = (curr_row+1 >= chunks_record->ratings->dimension());
bool just_classified = (changed & kJustClassifiedFlag);
LanguageModelFlagsType new_changed = 0x0;
// Ngram denominator; only computed when the character ngram model is on.
float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
// Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
ViterbiStateEntry_IT vit;
BLOB_CHOICE_IT c_it(curr_list);
int c_it_counter = 0;
bool first_iteration = true;
// Top lower/upper-case choices, used for the case flags below.
BLOB_CHOICE *first_lower = NULL;
BLOB_CHOICE *first_upper = NULL;
GetTopChoiceLowerUpper(changed, curr_list, &first_lower, &first_upper);
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
// NOTE(review): loop body truncated in extraction -- presumably a
// depth-limit check (using c_it_counter) guarded this break.
break;
}
// Skip NULL unichars unless it is the only choice.
if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
if (dict_->getUnicharset().get_fragment(c_it.data()->unichar_id())) {
continue; // skip fragments
}
// Set top choice flags.
LanguageModelFlagsType top_choice_flags = 0x0;
// NOTE(review): "(changed | kSmallestRatingFlag)" is always non-zero;
// a bitwise "&" was likely intended -- confirm against the original.
if (first_iteration && (changed | kSmallestRatingFlag)) {
top_choice_flags |= kSmallestRatingFlag;
}
if (first_lower == c_it.data()) top_choice_flags |= kLowerCaseFlag;
if (first_upper == c_it.data()) top_choice_flags |= kUpperCaseFlag;
if (parent_list == NULL) { // process the beginning of a word
new_changed |= AddViterbiStateEntry(
top_choice_flags, denom, word_end, curr_col, curr_row,
c_it.data(), NULL, NULL, pain_points, best_path_by_column,
chunks_record, best_choice_bundle, blamer_bundle);
} else { // get viterbi entries from each of the parent BLOB_CHOICEs
BLOB_CHOICE_IT p_it(parent_list);
for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
LanguageModelState *parent_lms =
reinterpret_cast<LanguageModelState *>(
p_it.data()->language_model_state());
if (parent_lms == NULL || parent_lms->viterbi_state_entries.empty()) {
continue;
}
vit.set_to_list(&(parent_lms->viterbi_state_entries));
int vit_counter = 0;
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
// Skip pruned entries and do not look at prunable entries if already
// examined language_model_viterbi_list_max_num_prunable of those.
// NOTE(review): a clause (likely the vit_counter limit check) is
// missing between these lines, leaving an extra ")".
if (PrunablePath(vit.data()->top_choice_flags,
vit.data()->dawg_info) &&
(language_model_ngram_on && vit.data()->ngram_info->pruned))) {
continue;
}
// Only consider the parent if it has been updated or
// if the current ratings cell has just been classified.
if (!just_classified && !vit.data()->updated) continue;
// Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
// looks good according to the Dawgs or character ngram model.
new_changed |= AddViterbiStateEntry(
top_choice_flags, denom, word_end, curr_col, curr_row,
c_it.data(), p_it.data(), vit.data(), pain_points,
best_path_by_column, chunks_record,
best_choice_bundle, blamer_bundle);
}
} // done looking at parents for this c_it.data()
}
first_iteration = false;
}
return new_changed;
}

Member Data Documentation

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 844 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 827 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::beginning_constraints_
protected

Definition at line 828 of file language_model.h.

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 846 of file language_model.h.

DawgArgs* tesseract::LanguageModel::dawg_args_
protected

Definition at line 786 of file language_model.h.

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 807 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::empty_dawg_info_vec_
protected

Definition at line 830 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::fixed_length_beginning_active_dawgs_
protected

Definition at line 829 of file language_model.h.

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 814 of file language_model.h.

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 803 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kAllChangedFlag = 0xff
static

Definition at line 293 of file language_model.h.

const float tesseract::LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f
static

Definition at line 269 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kConsistentFlag = 0x8
static

Definition at line 289 of file language_model.h.

const float tesseract::LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f
static

Definition at line 270 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kDawgFlag = 0x10
static

Definition at line 290 of file language_model.h.

const float tesseract::LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f
static

Definition at line 268 of file language_model.h.

const float tesseract::LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f
static

Definition at line 267 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kJustClassifiedFlag = 0x80
static

Definition at line 292 of file language_model.h.

const float tesseract::LanguageModel::kLooseMaxCharWhRatio = 2.5f
static

Definition at line 282 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 287 of file language_model.h.

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 274 of file language_model.h.

const int tesseract::LanguageModel::kMinFixedLengthDawgLength = 2
static

Definition at line 278 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kNgramFlag = 0x20
static

Definition at line 291 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 286 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 288 of file language_model.h.

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 738 of file language_model.h.

int tesseract::LanguageModel::language_model_fixed_length_choices_depth = 3

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

Definition at line 766 of file language_model.h.

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 763 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 752 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 740 of file language_model.h.

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 742 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 758 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 750 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 760 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 755 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 775 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 779 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 781 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 784 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 771 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 769 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 773 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 777 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 783 of file language_model.h.

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 786 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is true)" " entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 745 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 747 of file language_model.h.

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 817 of file language_model.h.

float tesseract::LanguageModel::max_penalty_adjust_
protected

Definition at line 832 of file language_model.h.

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 824 of file language_model.h.

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 825 of file language_model.h.

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 798 of file language_model.h.

GenericVector<bool *> tesseract::LanguageModel::updated_flags_
protected

Definition at line 796 of file language_model.h.


The documentation for this class was generated from the following files: