Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
context.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: context.c (Formerly context.c)
5  * Description: Context checking functions
6  * Author: Mark Seaman, OCR Technology
7  * Created: Thu Feb 15 11:18:24 1990
8  * Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1990, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 
26 #include "dict.h"
27 #include "tprintf.h"
28 #include "unicharset.h"
29 
30 namespace tesseract {
31 
namespace {

// Words with fewer characters than this are never rejected by the
// garbage-word check later in this file.
const int kMinAbsoluteGarbageWordLength = 10;

// For words that are long enough, the check flags the word when the fraction
// of its characters that are alphabetic or digits is below this value.
const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

}  // namespace
34 
// State-transition table driving Dict::case_ok().
//
// Row    = current state (listed beside each row below).
// Column = class of the next character, as selected by Dict::case_ok():
//   P (0) any character that is not upper case, lower case or a digit
//         (presumably punctuation),
//   U (1) upper case,
//   L (2) lower case,
//   D (3) digit.
// Entry  = next state; -1 means "error on case".
const int case_state_table[6][4] = {
  //  P   U   L   D
  {   0,  1,  5,  4 },  // 0. Beginning of word
  {   0,  3,  2,  4 },  // 1. After initial capital
  {   0, -1,  2, -1 },  // 2. After lower case
  {   0,  3, -1,  4 },  // 3. After upper case
  {   0, -1, -1,  4 },  // 4. After a digit
  {   5, -1,  2, -1 },  // 5. After initial lower case
};
57 
58 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
59  int last_state = 0;
60  int state = 0;
61  int x;
62  for (x = 0; x < word.length(); ++x) {
63  UNICHAR_ID ch_id = word.unichar_id(x);
64  if (unicharset.get_isupper(ch_id))
65  state = case_state_table[state][1];
66  else if (unicharset.get_islower(ch_id))
67  state = case_state_table[state][2];
68  else if (unicharset.get_isdigit(ch_id))
69  state = case_state_table[state][3];
70  else
71  state = case_state_table[state][0];
72  if (state == -1) return false;
73  last_state = state;
74  }
75  return state != 5; // single lower is bad
76 }
77 
79  const UNICHARSET &unicharset) {
80  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
81  int num_alphanum = 0;
82  for (int x = 0; x < word.length(); ++x) {
83  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
84  unicharset.get_isdigit(word.unichar_id(x)));
85  }
86  return (static_cast<float>(num_alphanum) /
87  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
88 }
89 
90 } // namespace tesseract