Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.c
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  ** History: Wed Dec 19 16:18:06 1990, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
21 #include "normmatch.h"
22 
23 #include <stdio.h>
24 #include <math.h>
25 
26 #include "classify.h"
27 #include "clusttool.h"
28 #include "const.h"
29 #include "efio.h"
30 #include "emalloc.h"
31 #include "globals.h"
32 #include "helpers.h"
33 #include "normfeat.h"
34 #include "scanutils.h"
35 #include "unicharset.h"
36 #include "params.h"
37 
39 {
40  int NumParams;
43  int NumProtos;
44 };
45 
49 double NormEvidenceOf(register double NormAdj);
50 
51 void PrintNormMatch(FILE *File,
52  int NumParams,
53  PROTOTYPE *Proto,
54  FEATURE Feature);
55 
56 NORM_PROTOS *ReadNormProtos(FILE *File);
57 
62 /* control knobs used to control the normalization adjustment process */
// classify_norm_adj_midpoint: NormEvidenceOf divides the raw distance by this
// value before applying the exponent.
63 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// classify_norm_adj_curl: exponent NormEvidenceOf applies to the scaled
// distance (values 2 and 3 take a multiplication shortcut there).
64 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
65 // Weight of width variance against height and vertical position.
// ComputeNormMatch multiplies the width (CharNormRy) term by this factor
// before adding it to the running distance.
66 const double kWidthErrorWeighting = 0.125;
67 
71 /*---------------------------------------------------------------------------*/
72 namespace tesseract {
// NOTE(review): the line carrying the return type, routine name and first
// parameter is not visible in this listing (numbering jumps 72 -> 74). The
// body reads ClassId and the trailer comment names the routine
// ComputeNormMatch.
74  const FEATURE_STRUCT& feature,
75  BOOL8 DebugMatch) {
76 /*
77  ** Parameters:
78  ** ClassId id of class to match against
79  ** Feature character normalization feature
80  ** DebugMatch controls dump of debug info
81  ** Globals:
82  ** NormProtos character normalization prototypes
83  ** Operation: This routine compares Features against each character
84  ** normalization proto for ClassId and returns the match
85  ** rating of the best match.
86  ** Return: Best match rating for Feature against protos of ClassId.
87  ** Exceptions: none
88  ** History: Wed Dec 19 16:56:12 1990, DSJ, Created.
89  */
90  LIST Protos;
91  FLOAT32 BestMatch;
92  FLOAT32 Match;
93  FLOAT32 Delta;
94  PROTOTYPE *Proto;
95  int ProtoId;
96 
97  /* handle requests for classification as noise */
98  if (ClassId == NO_CLASS) {
99  /* kludge - clean up constants and make into control knobs later */
// The noise distance uses only the length, Rx and Ry parameters, each squared
// and scaled by a fixed constant; no prototype is consulted.
100  Match = (feature.Params[CharNormLength] *
101  feature.Params[CharNormLength] * 500.0 +
102  feature.Params[CharNormRx] *
103  feature.Params[CharNormRx] * 8000.0 +
104  feature.Params[CharNormRy] *
105  feature.Params[CharNormRy] * 8000.0);
106  return (1.0 - NormEvidenceOf (Match));
107  }
108 
// Start from the largest value so the first proto distance replaces it.
109  BestMatch = MAX_FLOAT32;
110  Protos = NormProtos->Protos[ClassId];
111 
112  if (DebugMatch) {
113  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
114  }
115 
// ProtoId is incremented once per proto but is not read anywhere in the
// visible body.
116  ProtoId = 0;
117  iterate(Protos) {
118  Proto = (PROTOTYPE *) first_node (Protos);
// Each term is (feature - proto mean)^2 multiplied by the proto's elliptical
// weight for that parameter; Match accumulates the terms.
119  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
120  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
121  if (DebugMatch) {
122  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
123  Proto->Mean[CharNormY], Delta,
124  Proto->Weight.Elliptical[CharNormY], Match);
125  }
126  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
127  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
128  if (DebugMatch) {
129  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
130  Proto->Mean[CharNormRx], Delta,
131  Proto->Weight.Elliptical[CharNormRx], Match);
132  }
133  // Ry is width! See intfx.cpp.
134  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
135  if (DebugMatch) {
136  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
137  Proto->Mean[CharNormRy], Delta,
138  Proto->Weight.Elliptical[CharNormRy]);
139  }
// Delta is reused here to hold the weighted squared width error, which is
// then down-weighted by kWidthErrorWeighting before joining the total.
140  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
141  Delta *= kWidthErrorWeighting;
142  Match += Delta;
143  if (DebugMatch) {
144  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
145  Match, Match / classify_norm_adj_midpoint,
146  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
147  }
148 
// Keep the smallest distance seen over all protos of this class.
149  if (Match < BestMatch)
150  BestMatch = Match;
151 
152  ProtoId++;
153  }
// The returned rating is 1 minus the evidence of the smallest distance. If the
// loop body never ran, BestMatch is still MAX_FLOAT32 at this point.
154  return 1.0 - NormEvidenceOf(BestMatch);
155 } /* ComputeNormMatch */
156 
// NOTE(review): the header of the next routine (listing line 157) and the
// statements that follow the for-header (listing lines 160-162) are not
// visible here. Only the NULL guard, the Efree of NormProtos and the reset of
// the pointer to NULL can be seen; confirm the rest against the original file.
158  if (NormProtos != NULL) {
159  for (int i = 0; i < NormProtos->NumProtos; i++)
163  Efree(NormProtos);
164  NormProtos = NULL;
165  }
166 }
167 } // namespace tesseract
168 
172 /**********************************************************************
173  * NormEvidenceOf
174  *
175  * Return the new type of evidence number corresponding to this
176  * normalization adjustment. The equation that represents the transform is:
177  * 1 / (1 + (NormAdj / midpoint) ^ curl)
178  **********************************************************************/
179 double NormEvidenceOf(register double NormAdj) {
180  NormAdj /= classify_norm_adj_midpoint;
181 
182  if (classify_norm_adj_curl == 3)
183  NormAdj = NormAdj * NormAdj * NormAdj;
184  else if (classify_norm_adj_curl == 2)
185  NormAdj = NormAdj * NormAdj;
186  else
187  NormAdj = pow (NormAdj, classify_norm_adj_curl);
188  return (1.0 / (1.0 + NormAdj));
189 }
190 
191 
192 /*---------------------------------------------------------------------------*/
193 void PrintNormMatch(FILE *File,
194  int NumParams,
195  PROTOTYPE *Proto,
196  FEATURE Feature) {
197 /*
198  ** Parameters:
199  ** File open text file to dump match debug info to
200  ** NumParams # of parameters in proto and feature
201  ** Proto[] array of prototype parameters
202  ** Feature[] array of feature parameters
203  ** Globals: none
204  ** Operation: This routine dumps out detailed normalization match info.
205  ** Return: none
206  ** Exceptions: none
207  ** History: Wed Jan 2 09:49:35 1991, DSJ, Created.
208  */
209  int i;
210  FLOAT32 ParamMatch;
211  FLOAT32 TotalMatch;
212 
213  for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
214  ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
215  StandardDeviation(Proto, i);
216 
217  fprintf (File, " %6.1f", ParamMatch);
218 
219  if (i == CharNormY || i == CharNormRx)
220  TotalMatch += ParamMatch * ParamMatch;
221  }
222  fprintf (File, " --> %6.1f (%4.2f)\n",
223  TotalMatch, NormEvidenceOf (TotalMatch));
224 
225 } /* PrintNormMatch */
226 
227 
228 /*---------------------------------------------------------------------------*/
229 namespace tesseract {
230 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
231 /*
232  ** Parameters:
233  ** File open text file to read normalization protos from
234  ** Globals: none
235  ** Operation: This routine allocates a new data structure to hold
236  ** a set of character normalization protos. It then fills in
237  ** the data structure by reading from the specified File.
238  ** Return: Character normalization protos.
239  ** Exceptions: none
240  ** History: Wed Dec 19 16:38:49 1990, DSJ, Created.
241  */
243  int i;
244  char unichar[2 * UNICHAR_LEN + 1];
245  UNICHAR_ID unichar_id;
246  LIST Protos;
247  int NumProtos;
248 
249  /* allocate and initialization data structure */
250  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
251  NormProtos->NumProtos = unicharset.size();
252  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
253  for (i = 0; i < NormProtos->NumProtos; i++)
254  NormProtos->Protos[i] = NIL_LIST;
255 
256  /* read file header and save in data structure */
257  NormProtos->NumParams = ReadSampleSize (File);
258  NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
259 
260  /* read protos for each class into a separate list */
261  while ((end_offset < 0 || ftell(File) < end_offset) &&
262  fscanf(File, "%s %d", unichar, &NumProtos) == 2) {
263  if (unicharset.contains_unichar(unichar)) {
264  unichar_id = unicharset.unichar_to_id(unichar);
265  Protos = NormProtos->Protos[unichar_id];
266  for (i = 0; i < NumProtos; i++)
267  Protos =
268  push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
269  NormProtos->Protos[unichar_id] = Protos;
270  } else {
271  cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
272  unichar);
273  for (i = 0; i < NumProtos; i++)
274  FreePrototype(ReadPrototype (File, NormProtos->NumParams));
275  }
276  SkipNewline(File);
277  }
278  return (NormProtos);
279 } /* ReadNormProtos */
280 } // namespace tesseract