"Fossies" - the Fresh Open Source Software Archive

Member "tesseract-ocr/doc/html/tess__lang__model_8h_source.html" (26 Oct 2012, 34213 Bytes) of package /linux/misc/old/tesseract-ocr-3.02.02-doc-html.tar.gz:


Caution: In this restricted "Fossies" environment the current HTML page may not be presented correctly and may have some non-functional links. Alternatively, you can try to browse the pure source code, or just view or download the uninterpreted raw source code. If the rendering is insufficient, you may try to find and view the page on the tesseract-ocr-3.02.02-doc-html.tar.gz project site itself.

Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tess_lang_model.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tess_lang_model.h
3  * Description: Declaration of the Tesseract Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESS_LANG_MODEL_H
21 #define TESS_LANG_MODEL_H
22 
23 #include <string>
24 
25 #include "char_altlist.h"
26 #include "cube_reco_context.h"
27 #include "cube_tuning_params.h"
28 #include "dict.h"
29 #include "lang_model.h"
30 #include "tessdatamanager.h"
31 #include "tess_lang_mod_edge.h"
32 
33 namespace tesseract {
34 
35 const int kStateCnt = 4;  // first dimension of num_state_machine_; length of num_max_repeat_
36 const int kNumLiteralCnt = 5;  // second dimension of num_state_machine_; length of literal_str_
37 
38 class TessLangModel : public LangModel {
39  public:
40  TessLangModel(const string &lm_params,
41  const string &data_file_path,
42  bool load_system_dawg,
43  TessdataManager *tessdata_manager,
44  CubeRecoContext *cntxt);
46  if (word_dawgs_ != NULL) {
47  word_dawgs_->delete_data_pointers();
48  delete word_dawgs_;
49  }
50  }
51 
52  // returns a pointer to the root of the language model
53  inline TessLangModEdge *Root() {
54  return NULL;
55  }
56 
57  // The general fan-out generation function. Returns the list of edges
58  // fanning-out of the specified edge and their count. If an AltList is
59  // specified, only the class-ids with a minimum cost are considered
60  LangModEdge **GetEdges(CharAltList *alt_list,
61  LangModEdge *edge,
62  int *edge_cnt);
63  // Determines if a sequence of 32-bit chars is valid in this language model
64  // starting from the root. If the eow_flag is ON, also checks for
65  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
66  // edge
67  bool IsValidSequence(const char_32 *sequence, bool eow_flag,
68  LangModEdge **final_edge = NULL);
69  bool IsLeadingPunc(char_32 ch);
70  bool IsTrailingPunc(char_32 ch);
71  bool IsDigit(char_32 ch);
72 
73  void RemoveInvalidCharacters(string *lm_str);
74  private:
75  // static LM state machines
76  static const Dawg *ood_dawg_;
77  static const Dawg *number_dawg_;
78  static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
79  static const int num_max_repeat_[kStateCnt];
80  // word_dawgs_ should only be loaded if cube has its own version of the
81  // unicharset (different from the one used by tesseract) and therefore
82  // can not use the dawgs loaded for tesseract (since the unichar ids
83  // encoded in the dawgs differ).
84  DawgVector *word_dawgs_;
85 
86  static int max_edge_;
87  static int max_ood_shape_cost_;
88 
89  // remaining language model elements needed by cube. These get loaded from
90  // the .lm file
91  string lead_punc_;
92  string trail_punc_;
93  string num_lead_punc_;
94  string num_trail_punc_;
95  string operators_;
96  string digits_;
97  string alphas_;
98  // String of characters in RHS of each line of <lang>.cube.lm
99  // Each element is hard-coded to correspond to a specific token type
100  // (see LoadLangModelElements)
101  string *literal_str_[kNumLiteralCnt];
102  // Recognition context needed to access language properties
103  // (case, cursive,..)
104  CubeRecoContext *cntxt_;
105  bool has_case_;
106 
107  // computes and returns the edges that fan out of an edge ref
108  int FanOut(CharAltList *alt_list,
109  const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
110  const char_32 *str, bool root_flag, LangModEdge **edge_array);
111  // generate edges from an NULL terminated string
112  // (used for punctuation, operators and digits)
113  int Edges(const char *strng, const Dawg *dawg,
114  EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
115  LangModEdge **edge_array);
116  // Generate the edges fanning-out from an edge in the number state machine
117  int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);
118  // Generate OOD edges
119  int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
120  EDGE_REF edge_ref_mask, LangModEdge **edge_array);
121  // Cleanup an edge array
122  void FreeEdges(int edge_cnt, LangModEdge **edge_array);
123  // Determines if a sequence of 32-bit chars is valid in this language model
124  // starting from the specified edge. If the eow_flag is ON, also checks for
125  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
126  // edge
127  bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
128  bool eow_flag, LangModEdge **final_edge);
129  // Parse language model elements from the given string, which should
130  // have been loaded from <lang>.cube.lm file, e.g. in CubeRecoContext
131  bool LoadLangModelElements(const string &lm_params);
132 
133  // Returns the number of word Dawgs in the language model.
134  int NumDawgs() const;
135 
136  // Returns the dawgs with the given index from either the dawgs
137  // stored by the Tesseract object, or the word_dawgs_.
138  const Dawg *GetDawg(int index) const;
139 };
140 } // tesseract
141 
142 #endif // TESS_LANG_MODEL_H