Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
devanagari_processing.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: devanagari_processing.cpp
3  * Description: Methods to process images containing devanagari symbols,
4  * prior to classification.
5  * Author: Shobhit Saxena
6  * Created: Mon Nov 17 20:26:01 IST 2008
7  *
8  * (C) Copyright 2008, Google Inc.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "devanagari_processing.h"
22 #include "allheaders.h"
23 #include "tordmain.h"
24 #include "img.h"
25 #include "statistc.h"
26 
27 // Flags controlling the debugging information for shiro-rekha splitting
28 // strategies.
30  "Debug level for split shiro-rekha process.");
31 
33  "Whether to create a debug image for split shiro-rekha process.");
34 
35 namespace tesseract {
36 
38  orig_pix_ = NULL;
39  segmentation_block_list_ = NULL;
40  splitted_image_ = NULL;
41  global_xheight_ = kUnspecifiedXheight;
42  perform_close_ = false;
43  debug_image_ = NULL;
44  pageseg_split_strategy_ = NO_SPLIT;
45  ocr_split_strategy_ = NO_SPLIT;
46 }
47 
49  Clear();
50 }
51 
53  pixDestroy(&orig_pix_);
54  pixDestroy(&splitted_image_);
55  pageseg_split_strategy_ = NO_SPLIT;
56  ocr_split_strategy_ = NO_SPLIT;
57  pixDestroy(&debug_image_);
58  segmentation_block_list_ = NULL;
59  global_xheight_ = kUnspecifiedXheight;
60  perform_close_ = false;
61 }
62 
63 // This method dumps a debug image to the specified location.
65  pixWrite(filename, debug_image_, IFF_PNG);
66 }
67 
68 // On setting the input image, a clone of it is owned by this class.
70  if (orig_pix_) {
71  pixDestroy(&orig_pix_);
72  }
73  orig_pix_ = pixClone(pix);
74 }
75 
76 // Top-level method to perform splitting based on current settings.
77 // Returns true if a split was actually performed.
78 // split_for_pageseg should be true if the splitting is being done prior to
79 // page segmentation. This mode uses the flag
80 // pageseg_devanagari_split_strategy to determine the splitting strategy.
81 bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
82  SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
83  ocr_split_strategy_;
84  if (split_strategy == NO_SPLIT) {
85  return false; // Nothing to do.
86  }
87  ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
88  split_strategy == MAXIMAL_SPLIT);
89  ASSERT_HOST(orig_pix_);
91  tprintf("Splitting shiro-rekha ...\n");
92  tprintf("Split strategy = %s\n",
93  split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
94  tprintf("Initial pageseg available = %s\n",
95  segmentation_block_list_ ? "yes" : "no");
96  }
97  // Create a copy of original image to store the splitting output.
98  pixDestroy(&splitted_image_);
99  splitted_image_ = pixCopy(NULL, orig_pix_);
100 
101  // Initialize debug image if required.
103  pixDestroy(&debug_image_);
104  debug_image_ = pixConvertTo32(orig_pix_);
105  }
106 
107  // Determine all connected components in the input image. A close operation
108  // may be required prior to this, depending on the current settings.
109  Pix* pix_for_ccs = pixClone(orig_pix_);
110  if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
111  !segmentation_block_list_) {
112  if (devanagari_split_debuglevel > 0) {
113  tprintf("Performing a global close operation..\n");
114  }
115  // A global measure is available for xheight, but no local information
116  // exists.
117  pixDestroy(&pix_for_ccs);
118  pix_for_ccs = pixCopy(NULL, orig_pix_);
119  PerformClose(pix_for_ccs, global_xheight_);
120  }
121  Pixa* ccs;
122  Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
123  boxaDestroy(&tmp_boxa);
124  pixDestroy(&pix_for_ccs);
125 
126  // Iterate over all connected components. Get their bounding boxes and clip
127  // out the image regions corresponding to these boxes from the original image.
128  // Conditionally run splitting on each of them.
129  Boxa* regions_to_clear = boxaCreate(0);
130  for (int i = 0; i < pixaGetCount(ccs); ++i) {
131  Box* box = ccs->boxa->box[i];
132  Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
133  ASSERT_HOST(word_pix);
134  int xheight = GetXheightForCC(box);
135  if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
137  pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
138  }
139  // If some xheight measure is available, attempt to pre-eliminate small
140  // blobs from the shiro-rekha process. This is primarily to save the CCs
141  // corresponding to punctuation marks/small dots etc which are part of
142  // larger graphemes.
143  if (xheight == kUnspecifiedXheight ||
144  (box->w > xheight / 3 && box->h > xheight / 2)) {
145  SplitWordShiroRekha(split_strategy, word_pix, xheight,
146  box->x, box->y, regions_to_clear);
147  } else if (devanagari_split_debuglevel > 0) {
148  tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
149  box->x, box->y, box->w, box->h);
150  }
151  pixDestroy(&word_pix);
152  }
153  // Actually clear the boxes now.
154  for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
155  Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
156  pixClearInRect(splitted_image_, box);
157  boxDestroy(&box);
158  }
159  boxaDestroy(&regions_to_clear);
160  pixaDestroy(&ccs);
162  DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" :
163  "ocr_split_debug.png");
164  }
165  return true;
166 }
167 
168 // Method to perform a close operation on the input image. The xheight
169 // estimate decides the size of sel used.
170 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
171  pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
172 }
173 
174 // This method resolves the cc bbox to a particular row and returns the row's
175 // xheight.
176 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
177  if (!segmentation_block_list_) {
178  return global_xheight_;
179  }
180  // Compute the box coordinates in Tesseract's coordinate system.
181  TBOX bbox(cc_bbox->x,
182  pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
183  cc_bbox->x + cc_bbox->w,
184  pixGetHeight(orig_pix_) - cc_bbox->y - 1);
185  // Iterate over all blocks.
186  BLOCK_IT block_it(segmentation_block_list_);
187  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
188  BLOCK* block = block_it.data();
189  // Iterate over all rows in the block.
190  ROW_IT row_it(block->row_list());
191  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
192  ROW* row = row_it.data();
193  if (!row->bounding_box().major_overlap(bbox)) {
194  continue;
195  }
196  // Row could be skewed, warped, etc. Use the position of the box to
197  // determine the baseline position of the row for that x-coordinate.
198  // Create a square TBOX whose baseline's mid-point lies at this point
199  // and side is row's xheight. Take the overlap of this box with the input
200  // box and check if it is a 'major overlap'. If so, this box lies in this
201  // row. In that case, return the xheight for this row.
202  float box_middle = 0.5 * (bbox.left() + bbox.right());
203  int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
204  TBOX test_box(box_middle - row->x_height() / 2,
205  baseline,
206  box_middle + row->x_height() / 2,
207  static_cast<int>(baseline + row->x_height()));
208  // Compute overlap. If it is is a major overlap, this is the right row.
209  if (bbox.major_overlap(test_box)) {
210  return row->x_height();
211  }
212  }
213  }
214  // No row found for this bbox.
215  return kUnspecifiedXheight;
216 }
217 
218 // Returns a list of regions (boxes) which should be cleared in the original
219 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one
220 // (or less) word only. Xheight measure could be the global estimate, the row
221 // estimate, or unspecified. If unspecified, over splitting may occur, since a
222 // conservative estimate of stroke width along with an associated multiplier
223 // is used in its place. It is advisable to have a specified xheight when
224 // splitting for classification/training.
225 // A vertical projection histogram of all the on-pixels in the input pix is
226 // computed. The maxima of this histogram is regarded as an approximate location
227 // of the shiro-rekha. By descending on the maxima's peak on both sides,
228 // stroke width of shiro-rekha is estimated.
229 // A horizontal projection histogram is computed for a sub-image of the input
230 // image, which extends from just below the shiro-rekha down to a certain
231 // leeway. The leeway depends on the input xheight, if provided, else a
232 // conservative multiplier on approximate stroke width is used (which may lead
233 // to over-splitting).
234 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
235  Pix* pix,
236  int xheight,
237  int word_left,
238  int word_top,
239  Boxa* regions_to_clear) {
240  if (split_strategy == NO_SPLIT) {
241  return;
242  }
243  int width = pixGetWidth(pix);
244  int height = pixGetHeight(pix);
245  // Statistically determine the yextents of the shiro-rekha.
246  int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
247  GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
248  &shirorekha_ylevel);
249  // Since the shiro rekha is also a stroke, its width is equal to the stroke
250  // width.
251  int stroke_width = shirorekha_bottom - shirorekha_top + 1;
252 
253  // Some safeguards to protect CCs we do not want to be split.
254  // These are particularly useful when the word wasn't eliminated earlier
255  // because xheight information was unavailable.
256  if (shirorekha_ylevel > height / 2) {
257  // Shirorekha shouldn't be in the bottom half of the word.
258  if (devanagari_split_debuglevel > 0) {
259  tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
260  word_left, word_top);
261  }
262  return;
263  }
264  if (stroke_width > height / 3) {
265  // Even the boldest of fonts shouldn't do this.
266  if (devanagari_split_debuglevel > 0) {
267  tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
268  word_left, word_top);
269  }
270  return;
271  }
272 
273  // Clear the ascender and descender regions of the word.
274  // Obtain a vertical projection histogram for the resulting image.
275  Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
276  width, 5 * stroke_width / 3);
277  Pix* word_in_xheight = pixCopy(NULL, pix);
278  pixClearInRect(word_in_xheight, box_to_clear);
279  // Also clear any pixels which are below shirorekha_bottom + some leeway.
280  // The leeway is set to xheight if the information is available, else it is a
281  // multiplier applied to the stroke width.
282  int leeway_to_keep = stroke_width * 3;
283  if (xheight != kUnspecifiedXheight) {
284  // This is because the xheight-region typically includes the shiro-rekha
285  // inside it, i.e., the top of the xheight range corresponds to the top of
286  // shiro-rekha.
287  leeway_to_keep = xheight - stroke_width;
288  }
289  box_to_clear->y = shirorekha_bottom + leeway_to_keep;
290  box_to_clear->h = height - box_to_clear->y;
291  pixClearInRect(word_in_xheight, box_to_clear);
292  boxDestroy(&box_to_clear);
293 
294  PixelHistogram vert_hist;
295  vert_hist.ConstructVerticalCountHist(word_in_xheight);
296  pixDestroy(&word_in_xheight);
297 
298  // If the number of black pixel in any column of the image is less than a
299  // fraction of the stroke width, treat it as noise / a stray mark. Perform
300  // these changes inside the vert_hist data itself, as that is used later on as
301  // a bit vector for the final split decision at every column.
302  for (int i = 0; i < width; ++i) {
303  if (vert_hist.hist()[i] <= stroke_width / 4)
304  vert_hist.hist()[i] = 0;
305  else
306  vert_hist.hist()[i] = 1;
307  }
308  // In order to split the line at any point, we make sure that the width of the
309  // gap is atleast half the stroke width.
310  int i = 0;
311  int cur_component_width = 0;
312  while (i < width) {
313  if (!vert_hist.hist()[i]) {
314  int j = 0;
315  while (i + j < width && !vert_hist.hist()[i+j])
316  ++j;
317  if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
318  // Perform a shiro-rekha split. The intervening region lies from i to
319  // i+j-1.
320  // A minimal single-pixel split makes the estimation of intra- and
321  // inter-word spacing easier during page layout analysis,
322  // whereas a maximal split may be needed for OCR, depending on
323  // how the engine was trained.
324  bool minimal_split = (split_strategy == MINIMAL_SPLIT);
325  int split_width = minimal_split ? 1 : j;
326  int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
327  if (!minimal_split || (i != 0 && i + j != width)) {
328  Box* box_to_clear =
329  boxCreate(word_left + split_left,
330  word_top + shirorekha_top - stroke_width / 3,
331  split_width,
332  5 * stroke_width / 3);
333  if (box_to_clear) {
334  boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
335  // Mark this in the debug image if needed.
337  pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
338  }
339  boxDestroy(&box_to_clear);
340  cur_component_width = 0;
341  }
342  }
343  }
344  i += j;
345  } else {
346  ++i;
347  ++cur_component_width;
348  }
349  }
350 }
351 
352 // Refreshes the words in the segmentation block list by using blobs in the
353 // input block list.
354 // The segmentation block list must be set.
356  C_BLOB_LIST* new_blobs) {
357  // The segmentation block list must have been specified.
358  ASSERT_HOST(segmentation_block_list_);
359  if (devanagari_split_debuglevel > 0) {
360  tprintf("Before refreshing blobs:\n");
361  PrintSegmentationStats(segmentation_block_list_);
362  tprintf("New Blobs found: %d\n", new_blobs->length());
363  }
364 
365  C_BLOB_LIST not_found_blobs;
366  RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
367  new_blobs,
368  ((devanagari_split_debugimage && debug_image_) ?
369  &not_found_blobs : NULL));
370 
371  if (devanagari_split_debuglevel > 0) {
372  tprintf("After refreshing blobs:\n");
373  PrintSegmentationStats(segmentation_block_list_);
374  }
375  if (devanagari_split_debugimage && debug_image_) {
376  // Plot out the original blobs for which no match was found in the new
377  // all_blobs list.
378  C_BLOB_IT not_found_it(&not_found_blobs);
379  for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
380  not_found_it.forward()) {
381  C_BLOB* not_found = not_found_it.data();
382  TBOX not_found_box = not_found->bounding_box();
383  Box* box_to_plot = GetBoxForTBOX(not_found_box);
384  pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
385  boxDestroy(&box_to_plot);
386  }
387 
388  // Plot out the blobs unused from all blobs.
389  C_BLOB_IT all_blobs_it(new_blobs);
390  for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
391  all_blobs_it.forward()) {
392  C_BLOB* a_blob = all_blobs_it.data();
393  Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
394  pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
395  boxDestroy(&box_to_plot);
396  }
397  }
398 }
399 
400 // Returns a new box object for the corresponding TBOX, based on the original
401 // image's coordinate system.
402 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
403  return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
404  tbox.width(), tbox.height());
405 }
406 
407 // This method returns the computed mode-height of blobs in the pix.
408 // It also prunes very small blobs from calculation.
410  Boxa* boxa = pixConnComp(pix, NULL, 8);
411  STATS heights(0, pixGetHeight(pix));
412  heights.clear();
413  for (int i = 0; i < boxaGetCount(boxa); ++i) {
414  Box* box = boxaGetBox(boxa, i, L_CLONE);
415  if (box->h >= 3 || box->w >= 3) {
416  heights.add(box->h, 1);
417  }
418  boxDestroy(&box);
419  }
420  boxaDestroy(&boxa);
421  return heights.mode();
422 }
423 
424 // This method returns y-extents of the shiro-rekha computed from the input
425 // word image.
426 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
427  int* shirorekha_top,
428  int* shirorekha_bottom,
429  int* shirorekha_ylevel) {
430  // Compute a histogram from projecting the word on a vertical line.
431  PixelHistogram hist_horiz;
432  hist_horiz.ConstructHorizontalCountHist(word_pix);
433  // Get the ylevel where the top-line exists. This is basically the global
434  // maxima in the horizontal histogram.
435  int topline_onpixel_count = 0;
436  int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
437 
438  // Get the upper and lower extents of the shiro rekha.
439  int thresh = (topline_onpixel_count * 70) / 100;
440  int ulimit = topline_ylevel;
441  int llimit = topline_ylevel;
442  while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
443  --ulimit;
444  while (llimit < word_pix->h && hist_horiz.hist()[llimit] >= thresh)
445  ++llimit;
446 
447  if (shirorekha_top) *shirorekha_top = ulimit;
448  if (shirorekha_bottom) *shirorekha_bottom = llimit;
449  if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
450 }
451 
452 // This method returns the global-maxima for the histogram. The frequency of
453 // the global maxima is returned in count, if specified.
455  int best_value = 0;
456  for (int i = 0; i < length_; ++i) {
457  if (hist_[i] > hist_[best_value]) {
458  best_value = i;
459  }
460  }
461  if (count) {
462  *count = hist_[best_value];
463  }
464  return best_value;
465 }
466 
467 // Methods to construct histograms from images.
469  Clear();
470  int width = pixGetWidth(pix);
471  int height = pixGetHeight(pix);
472  hist_ = new int[width];
473  length_ = width;
474  int wpl = pixGetWpl(pix);
475  l_uint32 *data = pixGetData(pix);
476  for (int i = 0; i < width; ++i)
477  hist_[i] = 0;
478  for (int i = 0; i < height; ++i) {
479  l_uint32 *line = data + i * wpl;
480  for (int j = 0; j < width; ++j)
481  if (GET_DATA_BIT(line, j))
482  ++(hist_[j]);
483  }
484 }
485 
487  Clear();
488  Numa* counts = pixCountPixelsByRow(pix, NULL);
489  length_ = numaGetCount(counts);
490  hist_ = new int[length_];
491  for (int i = 0; i < length_; ++i) {
492  l_int32 val = 0;
493  numaGetIValue(counts, i, &val);
494  hist_[i] = val;
495  }
496  numaDestroy(&counts);
497 }
498 
499 } // namespace tesseract.