Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tospace.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * tospace.cpp
3  *
4  * Compute fuzzy word spacing thresholds for each row.
5  * I.e. set : max_nonspace
6  * space_threshold
7  * min_space
8  * kern_size
9  * space_size
10  * for each row.
11  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
12  *
13  * Note: functions in this file were originally not members of any
14  * class or enclosed by any namespace. Now they are all static members
15  * of the Textord class.
16  *
17  **********************************************************************/
18 
19 #include "textord.h"
20 #include "mfcpch.h"
21 #include "tovars.h"
22 #include "drawtord.h"
23 #include "textord.h"
24 #include "ndminx.h"
25 #include "statistc.h"
26 
27 // Include automatically generated configuration file if running autoconf.
28 #ifdef HAVE_CONFIG_H
29 #include "config_auto.h"
30 #endif
31 
32 #define MAXSPACING 128 /*max expected spacing in pix */
33 
34 namespace tesseract {
36  ICOORD page_tr, //topright of page
37  TO_BLOCK_LIST *blocks //blocks on page
38  ) {
39  TO_BLOCK_IT block_it; //iterator
40  TO_BLOCK *block; //current block;
41  TO_ROW_IT row_it; //row iterator
42  TO_ROW *row; //current row
43  int block_index; //block number
44  int row_index; //row number
45  //estimated width of real spaces for whole block
46  inT16 block_space_gap_width;
47  //estimated width of non space gaps for whole block
48  inT16 block_non_space_gap_width;
49  BOOL8 old_text_ord_proportional;//old fixed/prop result
50  GAPMAP *gapmap = NULL; //map of big vert gaps in blk
51 
52  block_it.set_to_list (blocks);
53  block_index = 1;
54  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
55  block_it.forward ()) {
56  block = block_it.data ();
57  gapmap = new GAPMAP (block);
58  block_spacing_stats(block,
59  gapmap,
60  old_text_ord_proportional,
61  block_space_gap_width,
62  block_non_space_gap_width);
63  // Make sure relative values of block-level space and non-space gap
64  // widths are reasonable. The ratio of 1:3 is also used in
65  // block_spacing_stats, to correct the block_space_gap_width
66  // Useful for arabic and hindi, when the non-space gap width is
67  // often over-estimated and should not be trusted. A similar ratio
68  // is found in block_spacing_stats.
70  (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
71  block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
72  }
73  row_it.set_to_list (block->get_rows ());
74  row_index = 1;
75  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
76  row = row_it.data ();
77  if ((row->pitch_decision == PITCH_DEF_PROP) ||
78  (row->pitch_decision == PITCH_CORR_PROP)) {
79  if ((tosp_debug_level > 0) && !old_text_ord_proportional)
80  tprintf ("Block %d Row %d: Now Proportional\n",
81  block_index, row_index);
82  row_spacing_stats(row,
83  gapmap,
84  block_index,
85  row_index,
86  block_space_gap_width,
87  block_non_space_gap_width);
88  }
89  else {
90  if ((tosp_debug_level > 0) && old_text_ord_proportional)
91  tprintf
92  ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
93  block_index, row_index, row->pitch_decision,
94  row->fixed_pitch);
95  }
96 #ifndef GRAPHICS_DISABLED
99 #endif
100  row_index++;
101  }
102  delete gapmap;
103  block_index++;
104  }
105 }
106 
107 
108 /*************************************************************************
109  * block_spacing_stats()
110  *************************************************************************/
111 
// Gathers gap statistics over the rows of the block whose pitch_decision is
// PITCH_DEF_PROP or PITCH_CORR_PROP and writes three block-level results:
//   block_non_space_gap_width - floor of the median of all accepted gaps, or
//                               the narrowest blob width when there is at
//                               most one gap sample;
//   block_space_gap_width     - the larger of the median "wide" gap and
//                               3 * block_non_space_gap_width, or -1 when
//                               there are too few samples;
//   old_text_ord_proportional - debug-only flag derived from the two
//                               inter-quartile ranges (TRUE when samples
//                               are inadequate).
// gapmap is only forwarded to ignore_big_gap ().
// NOTE(review): this listing skips embedded source lines 142, 148, 159, 216,
// 221, 226, 235, 252, 254, 257 and 258, so several conditions below appear
// truncated. Consult the original file before editing them.
112 void Textord::block_spacing_stats(
113  TO_BLOCK *block,
114  GAPMAP *gapmap,
115  BOOL8 &old_text_ord_proportional,
116  inT16 &block_space_gap_width, //resulting estimate
117  inT16 &block_non_space_gap_width //resulting estimate
118  ) {
119  TO_ROW_IT row_it; //row iterator
120  TO_ROW *row; //current row
121  BLOBNBOX_IT blob_it; //iterator
122 
123  STATS centre_to_centre_stats (0, MAXSPACING);
124  //DEBUG USE ONLY
125  STATS all_gap_stats (0, MAXSPACING);
126  STATS space_gap_stats (0, MAXSPACING);
127  inT16 minwidth = MAX_INT16; //narrowest blob
128  TBOX blob_box;
129  TBOX prev_blob_box;
130  inT16 centre_to_centre;
131  inT16 gap_width;
132  float real_space_threshold;
133  float iqr_centre_to_centre; //DEBUG USE ONLY
134  float iqr_all_gap_stats; //DEBUG USE ONLY
135  inT32 end_of_row;
136  inT32 row_length;
137 
 // First pass: record every accepted gap and centre-to-centre distance,
 // and track the narrowest blob width seen.
138  row_it.set_to_list (block->get_rows ());
139  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
140  row = row_it.data ();
141  if (!row->blob_list ()->empty () &&
143  (row->pitch_decision == PITCH_DEF_PROP) ||
144  (row->pitch_decision == PITCH_CORR_PROP))) {
145  blob_it.set_to_list (row->blob_list ());
146  blob_it.mark_cycle_pt ();
147  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
149  blob_box = box_next_pre_chopped (&blob_it);
150  else if (tosp_stats_use_xht_gaps)
151  blob_box = reduced_box_next (row, &blob_it);
152  else
153  blob_box = box_next (&blob_it);
154  row_length = end_of_row - blob_box.left ();
155  if (blob_box.width () < minwidth)
156  minwidth = blob_box.width ();
157  prev_blob_box = blob_box;
158  while (!blob_it.cycled_list ()) {
160  blob_box = box_next_pre_chopped (&blob_it);
161  else if (tosp_stats_use_xht_gaps)
162  blob_box = reduced_box_next (row, &blob_it);
163  else
164  blob_box = box_next (&blob_it);
165  if (blob_box.width () < minwidth)
166  minwidth = blob_box.width ();
167  gap_width = blob_box.left () - prev_blob_box.right ();
168  if (!ignore_big_gap (row, row_length, gapmap,
169  prev_blob_box.right (), blob_box.left ())) {
170  all_gap_stats.add (gap_width, 1);
171 
172  centre_to_centre = (blob_box.left () + blob_box.right () -
173  (prev_blob_box.left () +
174  prev_blob_box.right ())) / 2;
175  //DEBUG
176  centre_to_centre_stats.add (centre_to_centre, 1);
177  // DEBUG
178  }
179  prev_blob_box = blob_box;
180  }
181  }
182  }
183 
184  //Inadequate samples
185  if (all_gap_stats.get_total () <= 1) {
186  block_non_space_gap_width = minwidth;
187  block_space_gap_width = -1; //No est. space width
188  //DEBUG
189  old_text_ord_proportional = TRUE;
190  }
191  else {
192  /* For debug only ..... */
193  iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
194  centre_to_centre_stats.ile (0.25);
195  iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
196  old_text_ord_proportional =
197  iqr_centre_to_centre * 2 > iqr_all_gap_stats;
198  /* .......For debug only */
199 
200  /*
201  The median of the gaps is used as an estimate of the NON-SPACE gap width.
202  This RELIES on the assumption that there are more gaps WITHIN words than
203  BETWEEN words in a block
204 
205  Now try to estimate the width of a real space for all real spaces in the
206  block. Do this by using a crude threshold to ignore "narrow" gaps, then
207  find the median of the "wide" gaps and use this.
208  */
209  block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
210  // median gap
211 
 // Second pass: collect only the gaps wider than real_space_threshold.
212  row_it.set_to_list (block->get_rows ());
213  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
214  row = row_it.data ();
215  if (!row->blob_list ()->empty () &&
217  (row->pitch_decision == PITCH_DEF_PROP) ||
218  (row->pitch_decision == PITCH_CORR_PROP))) {
219  real_space_threshold =
220  MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
222  blob_it.set_to_list (row->blob_list ());
223  blob_it.mark_cycle_pt ();
224  end_of_row =
225  blob_it.data_relative (-1)->bounding_box ().right ();
227  blob_box = box_next_pre_chopped (&blob_it);
228  else if (tosp_stats_use_xht_gaps)
229  blob_box = reduced_box_next (row, &blob_it);
230  else
231  blob_box = box_next (&blob_it);
 // NOTE(review): the first pass computes row_length as
 // end_of_row - blob_box.left (); the operands are swapped here, which
 // gives the opposite sign. Confirm whether this is intended.
232  row_length = blob_box.left () - end_of_row;
233  prev_blob_box = blob_box;
234  while (!blob_it.cycled_list ()) {
236  blob_box = box_next_pre_chopped (&blob_it);
237  else if (tosp_stats_use_xht_gaps)
238  blob_box = reduced_box_next (row, &blob_it);
239  else
240  blob_box = box_next (&blob_it);
241  gap_width = blob_box.left () - prev_blob_box.right ();
242  if ((gap_width > real_space_threshold) &&
243  !ignore_big_gap (row, row_length, gapmap,
244  prev_blob_box.right (),
245  blob_box.left ())) {
246  /*
247  If tosp_use_cert_spaces is enabled, the estimate of the space gap is
248  restricted to obvious spaces - those wider than half the xht or those
249  with wide blobs on both sides - i.e not things that are suspect 1's or
250  punctuation that is sometimes widely spaced.
251  */
253  (gap_width >
255  ||
256  ((gap_width >
259  || (!narrow_blob (row, prev_blob_box)
260  && !narrow_blob (row, blob_box))))
261  || (wide_blob (row, prev_blob_box)
262  && wide_blob (row, blob_box)))
263  space_gap_stats.add (gap_width, 1);
264  }
265  prev_blob_box = blob_box;
266  }
267  }
268  }
269  //Inadequate samples
270  if (space_gap_stats.get_total () <= 2)
271  block_space_gap_width = -1;//No est. space width
272  else
273  block_space_gap_width =
274  MAX ((inT16) floor (space_gap_stats.median ()),
275  3 * block_non_space_gap_width);
276  }
277 }
278 
279 
280 /*************************************************************************
281  * row_spacing_stats()
282  * Set values for min_space, max_non_space based on row stats only
283  * If failure - return 0 values.
284  *************************************************************************/
// Works out the spacing parameters of one row and stores them in the row:
// kern_size, space_size, space_threshold, min_space and max_nonspace.
//   row                       - row to measure; its fields are updated.
//   gapmap                    - forwarded to ignore_big_gap () and
//                               isolated_row_stats ().
//   block_idx, row_idx        - used in debug output and passed on to
//                               isolated_row_stats ().
//   block_space_gap_width     - block estimate of a space; when it is not
//                               > 0 a local value of floor (xheight / 2) is
//                               used instead.
//   block_non_space_gap_width - block estimate of a non-space gap.
// Steps, in order: collect first-pass gap histograms; choose kern/space/
// threshold via old_to_method (), isolated_row_stats () or the block
// defaults; optionally call improve_row_threshold (); apply the sanity
// checks selected by tosp_sanity_method; finally derive the fuzzy limits
// min_space and max_nonspace.
// NOTE(review): this listing skips embedded source lines 331, 340, 352, 355,
// 377, 388, 391, 403, 428, 453, 471, 540, 554 and 559, so several
// expressions below appear truncated. Consult the original file before
// editing them.
285 void Textord::row_spacing_stats(
286  TO_ROW *row,
287  GAPMAP *gapmap,
288  inT16 block_idx,
289  inT16 row_idx,
290  inT16 block_space_gap_width, //estimate for block
291  inT16 block_non_space_gap_width //estimate for block
292  ) {
293  //iterator
294  BLOBNBOX_IT blob_it = row->blob_list ();
295  STATS all_gap_stats (0, MAXSPACING);
296  STATS cert_space_gap_stats (0, MAXSPACING);
297  STATS all_space_gap_stats (0, MAXSPACING);
298  STATS small_gap_stats (0, MAXSPACING);
299  TBOX blob_box;
300  TBOX prev_blob_box;
301  inT16 gap_width;
302  inT16 real_space_threshold = 0;
303  inT16 max = 0;
304  inT16 index;
305  inT16 large_gap_count = 0;
306  BOOL8 suspected_table;
307  inT32 max_max_nonspace; //upper bound
308  BOOL8 good_block_space_estimate = block_space_gap_width > 0;
309  inT32 end_of_row;
310  inT32 row_length = 0;
311  float sane_space;
312  inT32 sane_threshold;
313 
314  /* Collect first pass stats for row */
315 
316  if (!good_block_space_estimate)
317  block_space_gap_width = inT16 (floor (row->xheight / 2));
318  if (!row->blob_list ()->empty ()) {
319  if (tosp_threshold_bias1 > 0)
320  real_space_threshold =
321  block_non_space_gap_width +
322  inT16 (floor (0.5 +
323  tosp_threshold_bias1 * (block_space_gap_width -
324  block_non_space_gap_width)));
325  else
326  real_space_threshold = //Old TO method
327  (block_space_gap_width + block_non_space_gap_width) / 2;
328  blob_it.set_to_list (row->blob_list ());
329  blob_it.mark_cycle_pt ();
330  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
332  blob_box = box_next_pre_chopped (&blob_it);
333  else if (tosp_stats_use_xht_gaps)
334  blob_box = reduced_box_next (row, &blob_it);
335  else
336  blob_box = box_next (&blob_it);
337  row_length = end_of_row - blob_box.left ();
338  prev_blob_box = blob_box;
339  while (!blob_it.cycled_list ()) {
341  blob_box = box_next_pre_chopped (&blob_it);
342  else if (tosp_stats_use_xht_gaps)
343  blob_box = reduced_box_next (row, &blob_it);
344  else
345  blob_box = box_next (&blob_it);
346  gap_width = blob_box.left () - prev_blob_box.right ();
347  if (ignore_big_gap (row, row_length, gapmap,
348  prev_blob_box.right (), blob_box.left ()))
349  large_gap_count++;
350  else {
351  if (gap_width >= real_space_threshold) {
353  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
354  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
356  || (!narrow_blob (row, prev_blob_box)
357  && !narrow_blob (row, blob_box))))
358  || (wide_blob (row, prev_blob_box)
359  && wide_blob (row, blob_box)))
360  cert_space_gap_stats.add (gap_width, 1);
361  all_space_gap_stats.add (gap_width, 1);
362  }
363  else
364  small_gap_stats.add (gap_width, 1);
365  all_gap_stats.add (gap_width, 1);
366  }
367  prev_blob_box = blob_box;
368  }
369  }
 // A row is treated as a possible table when more than one big gap was
 // ignored, or when one was ignored and few gap samples remain.
370  suspected_table = (large_gap_count > 1) ||
371  ((large_gap_count > 0) &&
372  (all_gap_stats.get_total () <= tosp_few_samples));
373 
374  /* Now determine row kern size, space size and threshold */
375 
376  if ((cert_space_gap_stats.get_total () >=
378  ((suspected_table ||
379  all_gap_stats.get_total () <= tosp_short_row) &&
380  cert_space_gap_stats.get_total () > 0)) {
381  old_to_method(row,
382  &all_gap_stats,
383  &cert_space_gap_stats,
384  &small_gap_stats,
385  block_space_gap_width,
386  block_non_space_gap_width);
387  } else {
389  !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
390  block_idx, row_idx)) {
392  tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
393  block_idx, row_idx);
394  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
395  //Use block default
396  row->space_size = block_space_gap_width;
397  if (all_gap_stats.get_total () > tosp_redo_kern_limit)
398  row->kern_size = all_gap_stats.median ();
399  else
400  row->kern_size = block_non_space_gap_width;
401  row->space_threshold =
402  inT32 (floor ((row->space_size + row->kern_size) /
404  }
405  else
406  old_to_method(row,
407  &all_gap_stats,
408  &all_space_gap_stats,
409  &small_gap_stats,
410  block_space_gap_width,
411  block_non_space_gap_width);
412  }
413  }
414 
415  if (tosp_improve_thresh && !suspected_table)
416  improve_row_threshold(row, &all_gap_stats);
417 
418  /* Now lets try to be careful not to do anything silly with tables when we
419  are ignoring big gaps*/
420  if (tosp_sanity_method == 0) {
421  if (suspected_table &&
422  (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
423  if (tosp_debug_level > 5)
424  tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
425  block_idx, row_idx,
426  row->kern_size, row->space_threshold, row->space_size);
427  row->space_threshold =
429  row->space_size = MAX (row->space_threshold + 1, row->xheight);
430  }
431  }
432  else if (tosp_sanity_method == 1) {
433  sane_space = row->space_size;
434  /* NEVER let space size get too close to kern size */
435  if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
436  || ((row->space_size - row->kern_size) <
437  (tosp_silly_kn_sp_gap * row->xheight))) {
438  if (good_block_space_estimate &&
439  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
440  sane_space = block_space_gap_width;
441  else
442  sane_space =
443  MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
444  row->xheight / 2);
445  if (tosp_debug_level > 5)
446  tprintf
447  ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
448  block_idx, row_idx, row->kern_size, row->space_threshold,
449  row->space_size, sane_space);
450  row->space_size = sane_space;
451  row->space_threshold =
452  inT32 (floor ((row->space_size + row->kern_size) /
454  }
455  /* NEVER let threshold get VERY far away from kern */
456  sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
457  MAX (row->kern_size, 2.5)));
458  if (row->space_threshold > sane_threshold) {
459  if (tosp_debug_level > 5)
460  tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
461  block_idx, row_idx,
462  row->kern_size,
463  row->space_threshold, row->space_size, sane_threshold);
464  row->space_threshold = sane_threshold;
465  if (row->space_size <= sane_threshold)
466  row->space_size = row->space_threshold + 1.0f;
467  }
468  /* Beware of tables - there may be NO spaces */
469  if (suspected_table) {
470  sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
472  sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
473 
474  if ((row->space_size < sane_space) ||
475  (row->space_threshold < sane_threshold)) {
476  if (tosp_debug_level > 5)
477  tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
478  block_idx, row_idx,
479  row->kern_size,
480  row->space_threshold, row->space_size);
481  //the minimum sane value
482  row->space_threshold = (inT32) sane_space;
483  row->space_size = MAX (row->space_threshold + 1, row->xheight);
484  }
485  }
486  }
487 
488  /* Now lets try to put some error limits on the threshold */
489 
490  if (tosp_old_to_method) {
491  /* Old textord made a space if gap >= threshold */
492  //NO FUZZY SPACES YET
493  row->max_nonspace = row->space_threshold;
494  //NO FUZZY SPACES YET
495  row->min_space = row->space_threshold + 1;
496  }
497  else {
498  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
499  row->min_space =
500  MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
501  inT32 (row->space_size));
502  if (row->min_space <= row->space_threshold)
503  //Dont be silly
504  row->min_space = row->space_threshold + 1;
505  /*
506  Lets try to guess the max certain kern gap by looking at the cluster of
507  kerns for the row. The row is proportional so the kerns should cluster
508  tightly at the bottom of the distribution. We also expect most gaps to be
509  kerns. Find the maximum of the kern piles between 0 and twice the kern
510  estimate. Piles before the first one with less than 1/10 the maximum
511  number of samples can be taken as certain kerns.
512 
513  Of course, there are some cases where the kern peak and space peaks merge,
514  so we will put an UPPER limit on the max certain kern gap of some fraction
515  below the threshold.
516  */
517 
518  max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
519 
520  //default
521  row->max_nonspace = max_max_nonspace;
522  for (index = 0; index <= max_max_nonspace; index++) {
523  if (all_gap_stats.pile_count (index) > max)
524  max = all_gap_stats.pile_count (index);
525  if ((index > row->kern_size) &&
526  (all_gap_stats.pile_count (index) < 0.1 * max)) {
527  row->max_nonspace = index;
528  break;
529  }
530  }
531  }
532 
533  /* Yet another algorithm - simpler this time - just choose a fraction of the
534  threshold to space range */
535 
536  if ((tosp_fuzzy_sp_fraction > 0) &&
537  (row->space_size > row->space_threshold))
538  row->min_space = MAX (row->min_space,
539  (inT32) ceil (row->space_threshold +
541  (row->space_size -
542  row->space_threshold)));
543 
544  /* Ensure that ANY space less than some multiplier times the kern size is
545  fuzzy. In tables there is a risk of erroneously setting a small space size
546  when there are no real spaces. Sometimes tables have text squashed into
547  columns so that the kn->sp ratio is small anyway - this means that we cant
548  use this to force a wider separation - hence we rely on context to join any
549  dubious breaks. */
550 
551  if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
552  (suspected_table || tosp_fuzzy_limit_all))
553  row->min_space = MAX (row->min_space,
555  row->kern_size));
556 
557  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
558  row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
560  (row->space_threshold -
561  row->kern_size));
562  }
563  if (row->max_nonspace > row->space_threshold) {
564  //Dont be silly
565  row->max_nonspace = row->space_threshold;
566  }
567 
568  if (tosp_debug_level > 5)
569  tprintf
570  ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
571  block_idx, row_idx, row_length, block_non_space_gap_width,
572  block_space_gap_width, real_space_threshold, row->kern_size,
573  row->max_nonspace, row->space_threshold, row->min_space,
574  row->space_size);
575  if (tosp_debug_level > 10)
576  tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
577  "row->space_threshold = %d\n",
578  row->kern_size, row->space_size, row->space_threshold);
579 }
580 
// Sets row->space_size, row->kern_size and row->space_threshold from the
// supplied histograms and the block-level estimates.
//   space_gap_stats - gaps believed to be spaces: the median is used when
//                     there are at least tosp_enough_space_samples_for_median
//                     samples, the mean when there is at least one, and
//                     block_space_gap_width otherwise.
//   small_gap_stats, all_gap_stats - candidates for the kern estimate;
//                     block_non_space_gap_width is the fallback.
// NOTE(review): this listing skips embedded source lines 596, 610, 625, 637,
// 655, 661 and 663, so several conditions below appear truncated. Consult
// the original file before editing them.
581 void Textord::old_to_method(
582  TO_ROW *row,
583  STATS *all_gap_stats,
584  STATS *space_gap_stats,
585  STATS *small_gap_stats,
586  inT16 block_space_gap_width, //estimate for block
587  inT16 block_non_space_gap_width //estimate for block
588  ) {
589  /* First, estimate row space size */
590  /* Old to condition was > 2 */
591  if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
592  //Adequate samples
593  /* Set space size to median of spaces BUT limits it if it seems wildly out */
594  row->space_size = space_gap_stats->median ();
595  if (row->space_size > block_space_gap_width * 1.5) {
597  row->space_size = block_space_gap_width * 1.5;
598  else
599  //BUG??? should be *1.5
600  row->space_size = block_space_gap_width;
601  }
602  if (row->space_size < (block_non_space_gap_width * 2) + 1)
603  row->space_size = (block_non_space_gap_width * 2) + 1;
604  }
605  //Only 1 or 2 samples
606  else if (space_gap_stats->get_total () >= 1) {
607  //hence mean not median
608  row->space_size = space_gap_stats->mean ();
609  if (row->space_size > block_space_gap_width * 1.5) {
611  row->space_size = block_space_gap_width * 1.5;
612  else
613  //BUG??? should be *1.5
614  row->space_size = block_space_gap_width;
615  }
616  if (row->space_size < (block_non_space_gap_width * 3) + 1)
617  row->space_size = (block_non_space_gap_width * 3) + 1;
618  }
619  else {
620  //Use block default
621  row->space_size = block_space_gap_width;
622  }
623 
624  /* Next, estimate row kern size */
626  (small_gap_stats->get_total () > tosp_redo_kern_limit))
627  row->kern_size = small_gap_stats->median ();
628  else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
629  row->kern_size = all_gap_stats->median ();
630  else //old TO -SAME FOR ALL ROWS
631  row->kern_size = block_non_space_gap_width;
632 
633  /* Finally, estimate row space threshold */
634  if (tosp_threshold_bias2 > 0) {
635  row->space_threshold =
636  inT32 (floor (0.5 + row->kern_size +
638  row->kern_size)));
639  } else {
640  /*
641  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
642  and holds this in a float. The use is with a >= test
643  NEW textord uses an integer threshold and a > test
644  It comes to the same thing.
645  (Though there is a difference in that old textor has integer space_size
646  and kern_size.)
647  */
648  row->space_threshold =
649  inT32 (floor ((row->space_size + row->kern_size) / 2));
650  }
651 
652  // Apply the same logic and ratios as in row_spacing_stats to
653  // restrict relative values of the row's space_size, kern_size, and
654  // space_threshold
656  ((row->space_size <
657  tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) ||
658  ((row->space_size - row->kern_size) <
659  tosp_silly_kn_sp_gap * row->xheight))) {
660  if (row->kern_size > 2.5)
662  row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) /
664  }
665 }
666 
667 
668 /*************************************************************************
669  * isolated_row_stats()
670  * Set values for min_space, max_non_space based on row stats only
671  *************************************************************************/
// Tries to derive row->kern_size, row->space_size and row->space_threshold
// from this row's own gaps only.
//   all_gap_stats   - histogram of the row's gaps; its median seeds the kern
//                     estimate and the crude threshold.
//   suspected_table - when TRUE, the mean of the certain spaces may be used
//                     for space_size.
//   block_idx, row_idx - used in debug output only.
// Returns FALSE, leaving the row untouched, when there are too few samples
// or too few small gaps. Returns FALSE with all three row fields zeroed when
// the resulting values fail the sanity check. Returns TRUE otherwise.
// NOTE(review): this listing skips embedded source lines 695, 712, 721, 733,
// 746, 754 and 760, so several conditions below appear truncated. Consult
// the original file before editing them.
672 BOOL8 Textord::isolated_row_stats(TO_ROW *row,
673  GAPMAP *gapmap,
674  STATS *all_gap_stats,
675  BOOL8 suspected_table,
676  inT16 block_idx,
677  inT16 row_idx) {
678  float kern_estimate;
679  float crude_threshold_estimate;
680  inT16 small_gaps_count;
681  inT16 total;
682  //iterator
683  BLOBNBOX_IT blob_it = row->blob_list ();
684  STATS cert_space_gap_stats (0, MAXSPACING);
685  STATS all_space_gap_stats (0, MAXSPACING);
686  STATS small_gap_stats (0, MAXSPACING);
687  TBOX blob_box;
688  TBOX prev_blob_box;
689  inT16 gap_width;
690  inT32 end_of_row;
691  inT32 row_length;
692 
693  kern_estimate = all_gap_stats->median ();
694  crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
696  small_gaps_count = stats_count_under (all_gap_stats,
697  (inT16)
698  ceil (crude_threshold_estimate));
699  total = all_gap_stats->get_total ();
700 
701  if ((total <= tosp_redo_kern_limit) ||
702  ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
703  (total - small_gaps_count < 1)) {
704  if (tosp_debug_level > 5)
705  tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
706  block_idx, row_idx);
707  return FALSE;
708  }
709  blob_it.set_to_list (row->blob_list ());
710  blob_it.mark_cycle_pt ();
711  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
713  blob_box = box_next_pre_chopped (&blob_it);
714  else if (tosp_stats_use_xht_gaps)
715  blob_box = reduced_box_next (row, &blob_it);
716  else
717  blob_box = box_next (&blob_it);
718  row_length = end_of_row - blob_box.left ();
719  prev_blob_box = blob_box;
720  while (!blob_it.cycled_list ()) {
722  blob_box = box_next_pre_chopped (&blob_it);
723  else if (tosp_stats_use_xht_gaps)
724  blob_box = reduced_box_next (row, &blob_it);
725  else
726  blob_box = box_next (&blob_it);
727  gap_width = blob_box.left () - prev_blob_box.right ();
728  if (!ignore_big_gap (row, row_length, gapmap,
729  prev_blob_box.right (), blob_box.left ()) &&
730  (gap_width > crude_threshold_estimate)) {
731  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
732  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
734  (!narrow_blob (row, prev_blob_box) &&
735  !narrow_blob (row, blob_box)))) ||
736  (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
737  cert_space_gap_stats.add (gap_width, 1);
738  all_space_gap_stats.add (gap_width, 1);
739  }
740  if (gap_width < crude_threshold_estimate)
741  small_gap_stats.add (gap_width, 1);
742 
743  prev_blob_box = blob_box;
744  }
745  if (cert_space_gap_stats.get_total () >=
747  //median
748  row->space_size = cert_space_gap_stats.median ();
749  else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
750  //to avoid spaced
751  row->space_size = cert_space_gap_stats.mean ();
752  // 1's in tables
753  else if (all_space_gap_stats.get_total () >=
755  //median
756  row->space_size = all_space_gap_stats.median ();
757  else
758  row->space_size = all_space_gap_stats.mean ();
759 
761  row->kern_size = small_gap_stats.median ();
762  else
763  row->kern_size = all_gap_stats->median ();
764  row->space_threshold =
765  inT32 (floor ((row->space_size + row->kern_size) / 2));
766  /* Sanity check */
767  if ((row->kern_size >= row->space_threshold) ||
768  (row->space_threshold >= row->space_size) ||
769  (row->space_threshold <= 0)) {
770  if (tosp_debug_level > 5)
771  tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
772  block_idx, row_idx,
773  row->kern_size, row->space_threshold, row->space_size);
774  row->kern_size = 0.0f;
775  row->space_threshold = 0;
776  row->space_size = 0.0f;
777  return FALSE;
778  }
779 
780  if (tosp_debug_level > 5)
781  tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
782  block_idx, row_idx,
783  row->kern_size, row->space_threshold, row->space_size);
784  return TRUE;
785 }
786 
787 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) {
788  inT16 index;
789  inT16 total = 0;
790 
791  for (index = 0; index < threshold; index++)
792  total += stats->pile_count (index);
793  return total;
794 }
795 
796 
797 /*************************************************************************
798  * improve_row_threshold()
799  * Try to recognise a "normal line" -
800  * > 25 gaps
801  * && space > 3 * kn && space > 10
802  * (I.e. reasonably large space and kn:sp ratio)
803  * && > 3/4 # gaps < kn + (sp - kn)/3
804  * (I.e. most gaps are well away from space estimate)
805  * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
806  * somewhere in the histogram between kn and sp
807  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
808  * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
809  * try moving the default threshold to within this band but leave the
810  * fuzzy limit calculation as at present.
811  *************************************************************************/
812 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
813  float sp = row->space_size;
814  float kn = row->kern_size;
815  inT16 reqd_zero_width = 0;
816  inT16 zero_width = 0;
817  inT16 zero_start = 0;
818  inT16 index = 0;
819 
820  if (tosp_debug_level > 10)
821  tprintf ("Improve row threshold 0");
822  if ((all_gap_stats->get_total () <= 25) ||
823  (sp <= 10) ||
824  (sp <= 3 * kn) ||
825  (stats_count_under (all_gap_stats,
826  (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
827  (0.75 * all_gap_stats->get_total ())))
828  return;
829  if (tosp_debug_level > 10)
830  tprintf (" 1");
831  /*
832  Look for the first region of all 0's in the histogram which is wider than
833  max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
834  threshold is not within it, move the threshold so that is is just inside it.
835  */
836  reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
837  if (reqd_zero_width < 3)
838  reqd_zero_width = 3;
839 
840  for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
841  if (all_gap_stats->pile_count (index) == 0) {
842  if (zero_width == 0)
843  zero_start = index;
844  zero_width++;
845  }
846  else {
847  if (zero_width >= reqd_zero_width)
848  break;
849  else {
850  zero_width = 0;
851  }
852  }
853  }
854  index--;
855  if (tosp_debug_level > 10)
856  tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
857  reqd_zero_width, zero_width, zero_start, row->space_threshold);
858  if ((zero_width < reqd_zero_width) ||
859  ((row->space_threshold >= zero_start) &&
860  (row->space_threshold <= index)))
861  return;
862  if (tosp_debug_level > 10)
863  tprintf (" 2");
864  if (row->space_threshold < zero_start) {
865  if (tosp_debug_level > 5)
866  tprintf
867  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
868  kn, sp, zero_start, index, row->space_threshold, zero_start);
869  row->space_threshold = zero_start;
870  }
871  if (row->space_threshold > index) {
872  if (tosp_debug_level > 5)
873  tprintf
874  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
875  kn, sp, zero_start, index, row->space_threshold, index);
876  row->space_threshold = index;
877  }
878 }
879 
880 
881 /**********************************************************************
882  * make_prop_words
883  *
884  * Convert a TO_BLOCK to a BLOCK.
885  **********************************************************************/
887  TO_ROW *row, // row to make
888  FCOORD rotation // for drawing
889  ) {
890  BOOL8 bol; //start of line
891  /* prev_ values are for start of word being built. non prev_ values are for
892  the gap between the word being built and the next one. */
893  BOOL8 prev_fuzzy_sp; //probably space
894  BOOL8 prev_fuzzy_non; //probably not
895  uinT8 prev_blanks; //in front of word
896  BOOL8 fuzzy_sp; //probably space
897  BOOL8 fuzzy_non; //probably not
898  uinT8 blanks; //in front of word
899  BOOL8 prev_gap_was_a_space = FALSE;
900  BOOL8 break_at_next_gap = FALSE;
901  ROW *real_row; //output row
902  C_OUTLINE_IT cout_it;
903  C_BLOB_LIST cblobs;
904  C_BLOB_IT cblob_it = &cblobs;
905  WERD_LIST words;
906  WERD_IT word_it; //new words
907  WERD *word; //new word
908  WERD_IT rep_char_it; //repeated char words
909  inT32 next_rep_char_word_right = MAX_INT32;
910  float repetition_spacing; //gap between repetitions
911  inT32 xstarts[2]; //row ends
912  double coeffs[3]; //quadratic
913  inT32 prev_x; //end of prev blob
914  BLOBNBOX *bblob; //current blob
915  TBOX blob_box; //bounding box
916  BLOBNBOX_IT box_it; //iterator
917  TBOX prev_blob_box;
918  TBOX next_blob_box;
919  inT16 prev_gap = MAX_INT16;
920  inT16 current_gap = MAX_INT16;
921  inT16 next_gap = MAX_INT16;
922  inT16 prev_within_xht_gap = MAX_INT16;
923  inT16 current_within_xht_gap = MAX_INT16;
924  inT16 next_within_xht_gap = MAX_INT16;
925  inT16 word_count = 0;
926 
927  rep_char_it.set_to_list (&(row->rep_words));
928  if (!rep_char_it.empty ()) {
929  next_rep_char_word_right =
930  rep_char_it.data ()->bounding_box ().right ();
931  }
932 
933  prev_x = -MAX_INT16;
934  cblob_it.set_to_list (&cblobs);
935  box_it.set_to_list (row->blob_list ());
936  word_it.set_to_list (&words);
937  bol = TRUE;
938  prev_blanks = 0;
939  prev_fuzzy_sp = FALSE;
940  prev_fuzzy_non = FALSE;
941  if (!box_it.empty ()) {
942  xstarts[0] = box_it.data ()->bounding_box ().left ();
943  if (xstarts[0] > next_rep_char_word_right) {
944  /* We need to insert a repeated char word at the start of the row */
945  word = rep_char_it.extract ();
946  word_it.add_after_then_move (word);
947  /* Set spaces before repeated char word */
948  word->set_flag (W_BOL, TRUE);
949  bol = FALSE;
950  word->set_blanks (0);
951  //NO uncertainty
952  word->set_flag (W_FUZZY_SP, FALSE);
953  word->set_flag (W_FUZZY_NON, FALSE);
954  xstarts[0] = word->bounding_box ().left ();
955  /* Set spaces after repeated char word (and leave current word set) */
956  repetition_spacing = find_mean_blob_spacing (word);
957  current_gap = box_it.data ()->bounding_box ().left () -
958  next_rep_char_word_right;
959  current_within_xht_gap = current_gap;
960  if (current_gap > tosp_rep_space * repetition_spacing) {
961  prev_blanks = (uinT8) floor (current_gap / row->space_size);
962  if (prev_blanks < 1)
963  prev_blanks = 1;
964  }
965  else
966  prev_blanks = 0;
967  if (tosp_debug_level > 5)
968  tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
969  box_it.data ()->bounding_box ().left (),
970  box_it.data ()->bounding_box ().bottom (),
971  repetition_spacing, current_gap);
972  prev_fuzzy_sp = FALSE;
973  prev_fuzzy_non = FALSE;
974  if (rep_char_it.empty ()) {
975  next_rep_char_word_right = MAX_INT32;
976  }
977  else {
978  rep_char_it.forward ();
979  next_rep_char_word_right =
980  rep_char_it.data ()->bounding_box ().right ();
981  }
982  }
983 
984  peek_at_next_gap(row,
985  box_it,
986  next_blob_box,
987  next_gap,
988  next_within_xht_gap);
989  do {
990  bblob = box_it.data ();
991  blob_box = bblob->bounding_box ();
992  if (bblob->joined_to_prev ()) {
993  if (bblob->cblob () != NULL) {
994  cout_it.set_to_list (cblob_it.data ()->out_list ());
995  cout_it.move_to_last ();
996  cout_it.add_list_after (bblob->cblob ()->out_list ());
997  delete bblob->cblob ();
998  }
999  } else {
1000  if (bblob->cblob() != NULL)
1001  cblob_it.add_after_then_move (bblob->cblob ());
1002  prev_x = blob_box.right ();
1003  }
1004  box_it.forward (); //next one
1005  bblob = box_it.data ();
1006  blob_box = bblob->bounding_box ();
1007 
1008  if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
1009  /* Real Blob - not multiple outlines or pre-chopped */
1010  prev_gap = current_gap;
1011  prev_within_xht_gap = current_within_xht_gap;
1012  prev_blob_box = next_blob_box;
1013  current_gap = next_gap;
1014  current_within_xht_gap = next_within_xht_gap;
1015  peek_at_next_gap(row,
1016  box_it,
1017  next_blob_box,
1018  next_gap,
1019  next_within_xht_gap);
1020 
1021  inT16 prev_gap_arg = prev_gap;
1022  inT16 next_gap_arg = next_gap;
1023  if (tosp_only_use_xht_gaps) {
1024  prev_gap_arg = prev_within_xht_gap;
1025  next_gap_arg = next_within_xht_gap;
1026  }
1027  // Decide if a word-break should be inserted
1028  if (blob_box.left () > next_rep_char_word_right ||
1029  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1030  current_gap, current_within_xht_gap,
1031  next_blob_box, next_gap_arg,
1032  blanks, fuzzy_sp, fuzzy_non,
1033  prev_gap_was_a_space,
1034  break_at_next_gap) ||
1035  box_it.at_first()) {
1036  /* Form a new word out of the blobs collected */
1037  word = new WERD (&cblobs, prev_blanks, NULL);
1038  word_count++;
1039  word_it.add_after_then_move (word);
1040  if (bol) {
1041  word->set_flag (W_BOL, TRUE);
1042  bol = FALSE;
1043  }
1044  if (prev_fuzzy_sp)
1045  //probably space
1046  word->set_flag (W_FUZZY_SP, TRUE);
1047  else if (prev_fuzzy_non)
1048  word->set_flag (W_FUZZY_NON, TRUE);
1049  //probably not
1050 
1051  if (blob_box.left () > next_rep_char_word_right) {
1052  /* We need to insert a repeated char word */
1053  word = rep_char_it.extract ();
1054  word_it.add_after_then_move (word);
1055 
1056  /* Set spaces before repeated char word */
1057  repetition_spacing = find_mean_blob_spacing (word);
1058  current_gap = word->bounding_box ().left () - prev_x;
1059  current_within_xht_gap = current_gap;
1060  if (current_gap > tosp_rep_space * repetition_spacing) {
1061  blanks =
1062  (uinT8) floor (current_gap / row->space_size);
1063  if (blanks < 1)
1064  blanks = 1;
1065  }
1066  else
1067  blanks = 0;
1068  if (tosp_debug_level > 5)
1069  tprintf
1070  ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1071  word->bounding_box ().left (),
1072  word->bounding_box ().bottom (),
1073  repetition_spacing, current_gap, blanks);
1074  word->set_blanks (blanks);
1075  //NO uncertainty
1076  word->set_flag (W_FUZZY_SP, FALSE);
1077  word->set_flag (W_FUZZY_NON, FALSE);
1078 
1079  /* Set spaces after repeated char word (and leave current word set) */
1080  current_gap =
1081  blob_box.left () - next_rep_char_word_right;
1082  if (current_gap > tosp_rep_space * repetition_spacing) {
1083  blanks = (uinT8) (current_gap / row->space_size);
1084  if (blanks < 1)
1085  blanks = 1;
1086  }
1087  else
1088  blanks = 0;
1089  if (tosp_debug_level > 5)
1090  tprintf (" Rgap:%d (%d blanks)\n",
1091  current_gap, blanks);
1092  fuzzy_sp = FALSE;
1093  fuzzy_non = FALSE;
1094 
1095  if (rep_char_it.empty ()) {
1096  next_rep_char_word_right = MAX_INT32;
1097  }
1098  else {
1099  rep_char_it.forward ();
1100  next_rep_char_word_right =
1101  rep_char_it.data ()->bounding_box ().right ();
1102  }
1103  }
1104 
1105  if (box_it.at_first () && rep_char_it.empty ()) {
1106  //at end of line
1107  word->set_flag (W_EOL, TRUE);
1108  xstarts[1] = prev_x;
1109  }
1110  else {
1111  prev_blanks = blanks;
1112  prev_fuzzy_sp = fuzzy_sp;
1113  prev_fuzzy_non = fuzzy_non;
1114  }
1115  }
1116  }
1117  }
1118  while (!box_it.at_first ()); //until back at start
1119 
1120  /* Insert any further repeated char words */
1121  while (!rep_char_it.empty ()) {
1122  word = rep_char_it.extract ();
1123  word_it.add_after_then_move (word);
1124 
1125  /* Set spaces before repeated char word */
1126  repetition_spacing = find_mean_blob_spacing (word);
1127  current_gap = word->bounding_box ().left () - prev_x;
1128  if (current_gap > tosp_rep_space * repetition_spacing) {
1129  blanks = (uinT8) floor (current_gap / row->space_size);
1130  if (blanks < 1)
1131  blanks = 1;
1132  }
1133  else
1134  blanks = 0;
1135  if (tosp_debug_level > 5)
1136  tprintf
1137  ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
1138  word->bounding_box ().left (), word->bounding_box ().bottom (),
1139  repetition_spacing, current_gap, blanks);
1140  word->set_blanks (blanks);
1141  //NO uncertainty
1142  word->set_flag (W_FUZZY_SP, FALSE);
1143  word->set_flag (W_FUZZY_NON, FALSE);
1144  prev_x = word->bounding_box ().right ();
1145  if (rep_char_it.empty ()) {
1146  //at end of line
1147  word->set_flag (W_EOL, TRUE);
1148  xstarts[1] = prev_x;
1149  }
1150  else {
1151  rep_char_it.forward ();
1152  }
1153  }
1154  coeffs[0] = 0;
1155  coeffs[1] = row->line_m ();
1156  coeffs[2] = row->line_c ();
1157  real_row = new ROW (row,
1158  (inT16) row->kern_size, (inT16) row->space_size);
1159  word_it.set_to_list (real_row->word_list ());
1160  //put words in row
1161  word_it.add_list_after (&words);
1162  real_row->recalc_bounding_box ();
1163 
1164  if (tosp_debug_level > 4) {
1165  tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1166  word_count,
1167  real_row->bounding_box ().left (),
1168  real_row->bounding_box ().bottom (),
1169  real_row->bounding_box ().right (),
1170  real_row->bounding_box ().top ());
1171  }
1172  return real_row;
1173  }
1174  return NULL;
1175 }
1176 
1177 /**********************************************************************
1178  * make_blob_words
1179  *
1180  * Converts words into blobs so that each blob is a single character.
1181  * Used for chopper test.
1182  **********************************************************************/
1184  TO_ROW *row, // row to make
1185  FCOORD rotation // for drawing
1186  ) {
1187  bool bol; // start of line
1188  ROW *real_row; // output row
1189  C_OUTLINE_IT cout_it;
1190  C_BLOB_LIST cblobs;
1191  C_BLOB_IT cblob_it = &cblobs;
1192  WERD_LIST words;
1193  WERD_IT word_it; // new words
1194  WERD *word; // new word
1195  double coeffs[3]; // quadratic
1196  BLOBNBOX *bblob; // current blob
1197  TBOX blob_box; // bounding box
1198  BLOBNBOX_IT box_it; // iterator
1199  inT16 word_count = 0;
1200 
1201  cblob_it.set_to_list(&cblobs);
1202  box_it.set_to_list(row->blob_list());
1203  word_it.set_to_list(&words);
1204  bol = TRUE;
1205  if (!box_it.empty()) {
1206 
1207  do {
1208  bblob = box_it.data();
1209  blob_box = bblob->bounding_box();
1210  if (bblob->joined_to_prev()) {
1211  if (bblob->cblob() != NULL) {
1212  cout_it.set_to_list(cblob_it.data()->out_list());
1213  cout_it.move_to_last();
1214  cout_it.add_list_after(bblob->cblob()->out_list());
1215  delete bblob->cblob();
1216  }
1217  } else {
1218  if (bblob->cblob() != NULL)
1219  cblob_it.add_after_then_move(bblob->cblob());
1220  }
1221  box_it.forward(); // next one
1222  bblob = box_it.data();
1223  blob_box = bblob->bounding_box();
1224 
1225  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1226  word = new WERD(&cblobs, 1, NULL);
1227  word_count++;
1228  word_it.add_after_then_move(word);
1229  if (bol) {
1230  word->set_flag(W_BOL, TRUE);
1231  bol = FALSE;
1232  }
1233  if (box_it.at_first()) { // at end of line
1234  word->set_flag(W_EOL, TRUE);
1235  }
1236  }
1237  }
1238  while (!box_it.at_first()); // until back at start
1239  /* Setup the row with created words. */
1240  coeffs[0] = 0;
1241  coeffs[1] = row->line_m();
1242  coeffs[2] = row->line_c();
1243  real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
1244  word_it.set_to_list(real_row->word_list());
1245  //put words in row
1246  word_it.add_list_after(&words);
1247  real_row->recalc_bounding_box();
1248  if (tosp_debug_level > 4) {
1249  tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1250  word_count,
1251  real_row->bounding_box().left(),
1252  real_row->bounding_box().bottom(),
1253  real_row->bounding_box().right(),
1254  real_row->bounding_box().top());
1255  }
1256  return real_row;
1257  }
1258  return NULL;
1259 }
1260 
// Decide whether the gap in front of blob_box should end the current word.
//
// row                     row being processed; supplies kern_size, space_size,
//                         xheight, max_nonspace, space_threshold, min_space.
// blob_box                box of the blob that follows the gap.
// prev_gap / next_gap     gaps on either side of the current one.
// prev_blob_box /
// next_blob_box           boxes compared against the narrow/wide/punctuation
//                         tests in the heuristics below.
// real_current_gap        the gap as measured on full boxes.
// within_xht_current_gap  the gap as measured on reduced boxes.
// blanks, fuzzy_sp,
// fuzzy_non               outputs describing the break; the comment in the
//                         body states they are only meaningful when TRUE is
//                         returned.
// prev_gap_was_a_space    in/out: set to TRUE at the beginning of a row and
//                         updated to (space && !fuzzy_non) on exit from the
//                         heuristic path.
// break_at_next_gap       in/out: when TRUE on entry it is cleared and TRUE is
//                         returned immediately, leaving blanks and the fuzzy
//                         flags untouched.
//
// Returns TRUE if a word break should be made at this gap.
//
// NOTE(review): this listing skips several embedded line numbers inside this
// function (flagged individually below), so some conditions are incomplete as
// shown -- restore them from the upstream file before relying on this text.
1261 BOOL8 Textord::make_a_word_break(
1262  TO_ROW *row, // row being made
1263  TBOX blob_box, // for next_blob // how many blanks?
1264  inT16 prev_gap,
1265  TBOX prev_blob_box,
1266  inT16 real_current_gap,
1267  inT16 within_xht_current_gap,
1268  TBOX next_blob_box,
1269  inT16 next_gap,
1270  uinT8 &blanks,
1271  BOOL8 &fuzzy_sp,
1272  BOOL8 &fuzzy_non,
1273  BOOL8& prev_gap_was_a_space,
1274  BOOL8& break_at_next_gap) {
1275  BOOL8 space;
1276  inT16 current_gap;
1277  float fuzzy_sp_to_kn_limit;
1278 
// A break was requested by the previous call (see the
// tosp_force_wordbreak_on_punct branch below): honour it and reset the flag.
1279  if (break_at_next_gap) {
1280  break_at_next_gap = FALSE;
1281  return TRUE;
1282  }
1283  /* Inhibit using the reduced gap if
1284  The kerning is large - chars are not kerned and reducing "f"s can cause
1285  erroneous blanks
1286  OR The real gap is less than 0
1287  OR The real gap is less than the kerning estimate
1288  */
1289  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
// NOTE(review): listing line 1290 is missing here (first half of this sub-condition).
1291  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1292  //Ignore the difference
1293  within_xht_current_gap = real_current_gap;
1294 
// NOTE(review): listing line 1295 is missing here (the `if` that selects the reduced gap).
1296  current_gap = within_xht_current_gap;
1297  else
1298  current_gap = real_current_gap;
1299 
1300  if (tosp_old_to_method) {
1301  //Boring old method
// A gap above max_nonspace is a space. Between max_nonspace and min_space the
// decision is marked fuzzy, leaning to space only above space_threshold.
1302  space = current_gap > row->max_nonspace;
1303  if (space && (current_gap < MAX_INT16)) {
1304  if (current_gap < row->min_space) {
1305  if (current_gap > row->space_threshold) {
1306  blanks = 1;
1307  fuzzy_sp = TRUE;
1308  fuzzy_non = FALSE;
1309  }
1310  else {
1311  blanks = 0;
1312  fuzzy_sp = FALSE;
1313  fuzzy_non = TRUE;
1314  }
1315  }
1316  else {
1317  blanks = (uinT8) (current_gap / row->space_size);
1318  if (blanks < 1)
1319  blanks = 1;
1320  fuzzy_sp = FALSE;
1321  fuzzy_non = FALSE;
1322  }
1323  }
1324  return space;
1325  }
1326  else {
1327  /* New exciting heuristic method */
1328  if (prev_blob_box.null_box ()) // Beginning of row
1329  prev_gap_was_a_space = TRUE;
1330 
1331  //Default as old TO
1332  space = current_gap > row->space_threshold;
1333 
1334  /* Set defaults for the word break incase we find one. Currently there are
1335  no fuzzy spaces. Depending on the reliability of the different heuristics
1336  we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1337  be used if the function returns TRUE - ie the word is to be broken.
1338  */
1339  blanks = (uinT8) (current_gap / row->space_size);
1340  if (blanks < 1)
1341  blanks = 1;
1342  fuzzy_sp = FALSE;
1343  fuzzy_non = FALSE;
1344  /*
1345  If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1346  despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1347  context.
1348  */
// Rule 20: the reduced gap crosses max_nonspace although the real gap does not.
1349  if (tosp_use_xht_gaps &&
1350  (real_current_gap <= row->max_nonspace) &&
1351  (within_xht_current_gap > row->max_nonspace)) {
1352  space = TRUE;
1353  fuzzy_non = TRUE;
1354 #ifndef GRAPHICS_DISABLED
1355  mark_gap (blob_box, 20,
1356  prev_gap, prev_blob_box.width (),
1357  current_gap, next_blob_box.width (), next_gap);
1358 #endif
1359  }
// Rule 21: the reduced gap crosses space_threshold although the real gap does not.
1360  else if (tosp_use_xht_gaps &&
1361  (real_current_gap <= row->space_threshold) &&
1362  (within_xht_current_gap > row->space_threshold)) {
1363  space = TRUE;
// NOTE(review): listing line 1364 is missing here (the `if` paired with the `else` below).
1365  fuzzy_sp = TRUE;
1366  else
1367  fuzzy_non = TRUE;
1368 #ifndef GRAPHICS_DISABLED
1369  mark_gap (blob_box, 21,
1370  prev_gap, prev_blob_box.width (),
1371  current_gap, next_blob_box.width (), next_gap);
1372 #endif
1373  }
// Rule 22: the reduced gap reaches min_space although the real gap does not.
1374  else if (tosp_use_xht_gaps &&
1375  (real_current_gap < row->min_space) &&
1376  (within_xht_current_gap >= row->min_space)) {
1377  space = TRUE;
1378 #ifndef GRAPHICS_DISABLED
1379  mark_gap (blob_box, 22,
1380  prev_gap, prev_blob_box.width (),
1381  current_gap, next_blob_box.width (), next_gap);
1382 #endif
1383  }
// A suspected punctuation blob following a non-punctuation blob: request a
// break at the gap handled by the NEXT call rather than at this one.
1384  else if (tosp_force_wordbreak_on_punct &&
1385  !suspected_punct_blob(row, prev_blob_box) &&
1386  suspected_punct_blob(row, blob_box)) {
1387  break_at_next_gap = TRUE;
1388  }
1389  /* Now continue with normal heuristics */
1390  else if ((current_gap < row->min_space) &&
1391  (current_gap > row->space_threshold)) {
1392  /* Heuristics to turn dubious spaces to kerns */
// NOTE(review): listing lines 1393 and 1395 are missing around the next
// statement (the guarding `if` and one factor of the expression).
1394  fuzzy_sp_to_kn_limit = row->kern_size +
1396  (row->space_size - row->kern_size);
1397  else
1398  fuzzy_sp_to_kn_limit = 99999.0f;
1399 
1400  /* If current gap is significantly smaller than the previous space the other
1401  side of a narrow blob then this gap is a kern. */
1402  if ((prev_blob_box.width () > 0) &&
1403  narrow_blob (row, prev_blob_box) &&
1404  prev_gap_was_a_space &&
1405  (current_gap <= tosp_gap_factor * prev_gap)) {
1406  if ((tosp_all_flips_fuzzy) ||
1407  (current_gap > fuzzy_sp_to_kn_limit)) {
// NOTE(review): listing line 1408 is missing here (the `if` paired with the `else` below).
1409  fuzzy_non = TRUE;
1410  else
1411  fuzzy_sp = TRUE;
1412  }
1413  else
1414  space = FALSE;
1415 #ifndef GRAPHICS_DISABLED
1416  mark_gap (blob_box, 1,
1417  prev_gap, prev_blob_box.width (),
1418  current_gap, next_blob_box.width (), next_gap);
1419 #endif
1420  }
1421  /* If current gap not much bigger than the previous kern the other side of a
1422  narrow blob then this gap is a kern as well */
1423  else if ((prev_blob_box.width () > 0) &&
1424  narrow_blob (row, prev_blob_box) &&
1425  !prev_gap_was_a_space &&
1426  (current_gap * tosp_gap_factor <= prev_gap)) {
1427  if ((tosp_all_flips_fuzzy) ||
1428  (current_gap > fuzzy_sp_to_kn_limit)) {
// NOTE(review): listing line 1429 is missing here (the `if` paired with the `else` below).
1430  fuzzy_non = TRUE;
1431  else
1432  fuzzy_sp = TRUE;
1433  }
1434  else
1435  space = FALSE;
1436 #ifndef GRAPHICS_DISABLED
1437  mark_gap (blob_box, 2,
1438  prev_gap, prev_blob_box.width (),
1439  current_gap, next_blob_box.width (), next_gap);
1440 #endif
1441  }
// Rules 3 and 4 mirror rules 1 and 2 using the NEXT blob and gap.
1442  else if ((next_blob_box.width () > 0) &&
1443  narrow_blob (row, next_blob_box) &&
1444  (next_gap > row->space_threshold) &&
1445  (current_gap <= tosp_gap_factor * next_gap)) {
1446  if ((tosp_all_flips_fuzzy) ||
1447  (current_gap > fuzzy_sp_to_kn_limit)) {
// NOTE(review): listing line 1448 is missing here (the `if` paired with the `else` below).
1449  fuzzy_non = TRUE;
1450  else
1451  fuzzy_sp = TRUE;
1452  }
1453  else
1454  space = FALSE;
1455 #ifndef GRAPHICS_DISABLED
1456  mark_gap (blob_box, 3,
1457  prev_gap, prev_blob_box.width (),
1458  current_gap, next_blob_box.width (), next_gap);
1459 #endif
1460  }
1461  else if ((next_blob_box.width () > 0) &&
1462  narrow_blob (row, next_blob_box) &&
1463  (next_gap <= row->space_threshold) &&
1464  (current_gap * tosp_gap_factor <= next_gap)) {
1465  if ((tosp_all_flips_fuzzy) ||
1466  (current_gap > fuzzy_sp_to_kn_limit)) {
// NOTE(review): listing line 1467 is missing here (the `if` paired with the `else` below).
1468  fuzzy_non = TRUE;
1469  else
1470  fuzzy_sp = TRUE;
1471  }
1472  else
1473  space = FALSE;
1474 #ifndef GRAPHICS_DISABLED
1475  mark_gap (blob_box, 4,
1476  prev_gap, prev_blob_box.width (),
1477  current_gap, next_blob_box.width (), next_gap);
1478 #endif
1479  }
// Rule 6: a narrow neighbour on either side leaves the space in place but
// marks it fuzzy.
1480  else if ((((next_blob_box.width () > 0) &&
1481  narrow_blob (row, next_blob_box)) ||
1482  ((prev_blob_box.width () > 0) &&
1483  narrow_blob (row, prev_blob_box)))) {
1484  fuzzy_sp = TRUE;
1485 #ifndef GRAPHICS_DISABLED
1486  mark_gap (blob_box, 6,
1487  prev_gap, prev_blob_box.width (),
1488  current_gap, next_blob_box.width (), next_gap);
1489 #endif
1490  }
1491  }
1492  else if ((current_gap > row->max_nonspace) &&
1493  (current_gap <= row->space_threshold)) {
1494 
1495  /* Heuristics to turn dubious kerns to spaces */
1496  /* TRIED THIS BUT IT MADE THINGS WORSE
1497  if ( prev_gap == MAX_INT16 )
1498  prev_gap = 0; // start of row
1499  if ( next_gap == MAX_INT16 )
1500  next_gap = 0; // end of row
1501  */
// Rule 7: both neighbours are wide and this gap is large relative to the
// bigger of the adjacent gaps.
1502  if ((prev_blob_box.width () > 0) &&
1503  (next_blob_box.width () > 0) &&
1504  (current_gap >=
1505  tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
1506  wide_blob (row, prev_blob_box) &&
1507  wide_blob (row, next_blob_box)) {
1508 
1509  space = TRUE;
1510  /*
1511  tosp_flip_caution is an attempt to stop the default changing in cases
1512  where there is a large difference between the kern and space estimates.
1513  See problem in 'chiefs' where "have" gets split in the quotation.
1514  */
1515  if ((tosp_flip_fuzz_kn_to_sp) &&
1516  ((tosp_flip_caution <= 0) ||
1517  (tosp_flip_caution * row->kern_size > row->space_size)))
1518  fuzzy_sp = TRUE;
1519  else
1520  fuzzy_non = TRUE;
1521 #ifndef GRAPHICS_DISABLED
1522  mark_gap (blob_box, 7,
1523  prev_gap, prev_blob_box.width (),
1524  current_gap, next_blob_box.width (), next_gap);
1525 #endif
1526  } else if (prev_blob_box.width() > 0 &&
1527  next_blob_box.width() > 0 &&
1528  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1529  current_gap >=
1530  tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) &&
1531  !(narrow_blob(row, prev_blob_box) ||
1532  suspected_punct_blob(row, prev_blob_box)) &&
1533  !(narrow_blob(row, next_blob_box) ||
1534  suspected_punct_blob(row, next_blob_box))) {
1535  space = TRUE;
1536  fuzzy_non = TRUE;
1537 #ifndef GRAPHICS_DISABLED
1538  mark_gap (blob_box, 8,
1539  prev_gap, prev_blob_box.width (),
1540  current_gap, next_blob_box.width (), next_gap);
1541 #endif
1542  }
1543  else if ((tosp_kern_gap_factor3 > 0) &&
1544  (prev_blob_box.width () > 0) &&
1545  (next_blob_box.width () > 0) &&
1546  (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
// NOTE(review): listing line 1547 is missing here (the opening of the parenthesised term closed below).
1548  (!suspected_punct_blob (row, prev_blob_box) &&
1549  !suspected_punct_blob (row, next_blob_box)))) {
1550  space = TRUE;
1551  fuzzy_non = TRUE;
1552 #ifndef GRAPHICS_DISABLED
1553  mark_gap (blob_box, 9,
1554  prev_gap, prev_blob_box.width (),
1555  current_gap, next_blob_box.width (), next_gap);
1556 #endif
1557  }
1558  }
1559  if (tosp_debug_level > 10)
1560  tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1561  "next_gap = %d\n", space ? 1 : 0, current_gap,
1562  prev_gap, next_gap);
// Remember the outcome for the next call: only a space that was not flagged
// fuzzy_non counts as "previous gap was a space".
1563  prev_gap_was_a_space = space && !(fuzzy_non);
1564  return space;
1565  }
1566 }
1567 
// Returns TRUE if blob_box counts as a narrow blob: either its width is at
// most tosp_narrow_fraction * row->xheight, or its width/height ratio is at
// most a limit whose expression is not visible in this listing.
1568 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1569  BOOL8 result;
1570  result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1571  (((float) blob_box.width () / blob_box.height ()) <=
// NOTE(review): listing line 1572 is missing here (right-hand side of the
// aspect-ratio comparison and the closing parentheses) -- restore from upstream.
1573  return result;
1574 }
1575 
// Returns TRUE if blob_box counts as a wide blob.
// When tosp_wide_fraction > 0 the width must be at least
// tosp_wide_fraction * row->xheight, and when tosp_wide_aspect_ratio > 0 the
// width/height ratio must additionally exceed a limit that is not visible in
// this listing. When tosp_wide_fraction <= 0, "wide" simply means "not narrow".
1576 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1577  BOOL8 result;
1578  if (tosp_wide_fraction > 0) {
1579  if (tosp_wide_aspect_ratio > 0)
1580  result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1581  (((float) blob_box.width () / blob_box.height ()) >
// NOTE(review): listing line 1582 is missing here (right-hand side of the
// aspect-ratio comparison and the closing parentheses) -- restore from upstream.
1583  else
1584  result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1585  }
1586  else
1587  result = !narrow_blob (row, blob_box);
1588  return result;
1589 }
1590 
1591 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1592  BOOL8 result;
1593  float baseline;
1594  float blob_x_centre;
1595  /* Find baseline of centre of blob */
1596  blob_x_centre = (box.right () + box.left ()) / 2.0;
1597  baseline = row->baseline.y (blob_x_centre);
1598 
1599  result = (box.height () <= 0.66 * row->xheight) ||
1600  (box.top () < baseline + row->xheight / 2.0) ||
1601  (box.bottom () > baseline + row->xheight / 2.0);
1602  return result;
1603 }
1604 
1605 
1606 void Textord::peek_at_next_gap(TO_ROW *row,
1607  BLOBNBOX_IT box_it,
1608  TBOX &next_blob_box,
1609  inT16 &next_gap,
1610  inT16 &next_within_xht_gap) {
1611  TBOX next_reduced_blob_box;
1612  TBOX bit_beyond;
1613  BLOBNBOX_IT reduced_box_it = box_it;
1614 
1615  next_blob_box = box_next (&box_it);
1616  next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1617  if (box_it.at_first ()) {
1618  next_gap = MAX_INT16;
1619  next_within_xht_gap = MAX_INT16;
1620  }
1621  else {
1622  bit_beyond = box_it.data ()->bounding_box ();
1623  next_gap = bit_beyond.left () - next_blob_box.right ();
1624  bit_beyond = reduced_box_next (row, &reduced_box_it);
1625  next_within_xht_gap =
1626  bit_beyond.left () - next_reduced_blob_box.right ();
1627  }
1628 }
1629 
1630 
1631 #ifndef GRAPHICS_DISABLED
// Debug aid for the space/kern heuristics: picks a ScrollView colour from the
// rule number, draws an ellipse on to_win centred on the gap to the left of
// `blob` (x radius current_gap/2, y radius half the blob height), and, when
// tosp_debug_level > 5, prints the rule number with the neighbouring gaps and
// blob widths.
1632 void Textord::mark_gap(
1633  TBOX blob, // blob following gap
1634  inT16 rule, // heuristic id
1635  inT16 prev_gap,
1636  inT16 prev_blob_width,
1637  inT16 current_gap,
1638  inT16 next_blob_width,
1639  inT16 next_gap) {
1640  ScrollView::Color col; //of ellipse marking flipped gap
1641 
// Rules 1-9 and 20-22 are the ids passed by make_a_word_break(); anything
// else (including the unused id 5's neighbours) falls back to BLACK.
1642  switch (rule) {
1643  case 1:
1644  col = ScrollView::RED;
1645  break;
1646  case 2:
1647  col = ScrollView::CYAN;
1648  break;
1649  case 3:
1650  col = ScrollView::GREEN;
1651  break;
1652  case 4:
1653  col = ScrollView::BLACK;
1654  break;
1655  case 5:
1656  col = ScrollView::MAGENTA;
1657  break;
1658  case 6:
1659  col = ScrollView::BLUE;
1660  break;
1661 
1662  case 7:
1663  col = ScrollView::WHITE;
1664  break;
1665  case 8:
1666  col = ScrollView::YELLOW;
1667  break;
1668  case 9:
1669  col = ScrollView::BLACK;
1670  break;
1671 
1672  case 20:
1673  col = ScrollView::CYAN;
1674  break;
1675  case 21:
1676  col = ScrollView::GREEN;
1677  break;
1678  case 22:
1679  col = ScrollView::MAGENTA;
1680  break;
1681  default:
1682  col = ScrollView::BLACK;
1683  }
// NOTE(review): listing line 1684 is missing here; the closing brace at
// listing line 1697 has no visible opener, so the drawing code below is
// presumably wrapped in a condition -- restore from upstream.
1685  to_win->Pen(col);
1686  /* if (rule < 20)
1687  //interior_style(to_win, INT_SOLID, FALSE);
1688  else
1689  //interior_style(to_win, INT_HOLLOW, TRUE);*/
1690  //x radius
1691  to_win->Ellipse (current_gap / 2.0f,
1692  blob.height () / 2.0f, //y radius
1693  //x centre
1694  blob.left () - current_gap / 2.0f,
1695  //y centre
1696  blob.bottom () + blob.height () / 2.0f);
1697  }
1698  if (tosp_debug_level > 5)
1699  tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
1700  blob.left () - current_gap / 2, blob.bottom (), rule,
1701  prev_gap, prev_blob_width, current_gap,
1702  next_blob_width, next_gap);
1703 }
1704 #endif
1705 
1706 float Textord::find_mean_blob_spacing(WERD *word) {
1707  C_BLOB_IT cblob_it;
1708  TBOX blob_box;
1709  inT32 gap_sum = 0;
1710  inT16 gap_count = 0;
1711  inT16 prev_right;
1712 
1713  cblob_it.set_to_list (word->cblob_list ());
1714  if (!cblob_it.empty ()) {
1715  cblob_it.mark_cycle_pt ();
1716  prev_right = cblob_it.data ()->bounding_box ().right ();
1717  //first blob
1718  cblob_it.forward ();
1719  for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1720  blob_box = cblob_it.data ()->bounding_box ();
1721  gap_sum += blob_box.left () - prev_right;
1722  gap_count++;
1723  prev_right = blob_box.right ();
1724  }
1725  }
1726  if (gap_count > 0)
1727  return (gap_sum / (float) gap_count);
1728  else
1729  return 0.0f;
1730 }
1731 
1732 
1733 BOOL8 Textord::ignore_big_gap(TO_ROW *row,
1734  inT32 row_length,
1735  GAPMAP *gapmap,
1736  inT16 left,
1737  inT16 right) {
1738  inT16 gap = right - left + 1;
1739 
1740  if (tosp_ignore_big_gaps > 999)
1741  return FALSE; //Dont ignore
1742  if (tosp_ignore_big_gaps > 0)
1743  return (gap > tosp_ignore_big_gaps * row->xheight);
1744  if (gap > tosp_ignore_very_big_gaps * row->xheight)
1745  return TRUE;
1746  if (tosp_ignore_big_gaps == 0) {
1747  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1748  return TRUE;
1749  if ((gap > 1.75 * row->xheight) &&
1750  ((row_length > 35 * row->xheight) ||
1751  gapmap->table_gap (left, right)))
1752  return TRUE;
1753  }
1754  else {
1755  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1756  if ((gap > gapmap_big_gaps * row->xheight) &&
1757  gapmap->table_gap (left, right))
1758  return TRUE;
1759  }
1760  return FALSE;
1761 }
1762 
1763 
1764 /**********************************************************************
1765  * reduced_box_next
1766  *
1767  * Compute the bounding box of this blob with merging of x overlaps
1768  * but no pre-chopping.
1769  * Then move the iterator on to the start of the next blob.
1770  * DONT reduce the box for small things - eg punctuation.
1771  **********************************************************************/
// Returns the reduced box for the blob at *it and advances *it to the next
// blob that has a cblob and is not joined to its predecessor.
// If the blob already reports a reduced box (red_box_set()), that stored box
// is returned; otherwise the box is computed here and stored on the head blob
// via set_reduced_box().
1772 TBOX Textord::reduced_box_next(
1773  TO_ROW *row, // current row
1774  BLOBNBOX_IT *it // iterator to blobs
1775  ) {
1776  BLOBNBOX *blob; //current blob
1777  BLOBNBOX *head_blob; //place to store box
1778  TBOX full_box; //full blob boundg box
1779  TBOX reduced_box; //box of significant part
1780  inT16 left_above_xht; //ABOVE xht left limit
1781  inT16 new_left_above_xht; //ABOVE xht left limit
1782 
1783  blob = it->data ();
1784  if (blob->red_box_set ()) {
// Already computed: just skip forward to the next real blob.
1785  reduced_box = blob->reduced_box ();
1786  do {
1787  it->forward();
1788  blob = it->data();
1789  }
1790  while (blob->cblob() == NULL || blob->joined_to_prev());
1791  return reduced_box;
1792  }
1793  head_blob = blob;
1794  full_box = blob->bounding_box ();
1795  reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
// Fold every following fragment into the boxes: blobs without a cblob only
// grow the full box; joined blobs grow the reduced box and may move the
// above-xheight left limit further left.
1796  do {
1797  it->forward ();
1798  blob = it->data ();
1799  if (blob->cblob() == NULL)
1800  //was pre-chopped
1801  full_box += blob->bounding_box ();
1802  else if (blob->joined_to_prev ()) {
1803  reduced_box +=
1804  reduced_box_for_blob(blob, row, &new_left_above_xht);
1805  left_above_xht = MIN (left_above_xht, new_left_above_xht);
1806  }
1807  }
1808  //until next real blob
1809  while (blob->cblob() == NULL || blob->joined_to_prev());
1810 
// Keep the reduced box only if it is non-empty, the left limit above the
// xheight lies to the right of the point tosp_near_lh_edge of the way across
// it, and it is taller than 0.7 * xheight; otherwise use the full box.
1811  if ((reduced_box.width () > 0) &&
1812  ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1813  < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1814 #ifndef GRAPHICS_DISABLED
// NOTE(review): listing lines 1815-1816 are missing here (graphics-only
// statements inside this #ifndef) -- restore from upstream.
1817 #endif
1818  }
1819  else
1820  reduced_box = full_box;
1821  head_blob->set_reduced_box (reduced_box);
1822  return reduced_box;
1823 }
1824 
1825 
1826 /*************************************************************************
1827  * reduced_box_for_blob()
1828  * Find box for blob which is the same height and y position as the whole blob,
1829  * but whose left limit is the left most position of the blob ABOVE the
1830  * baseline and whose right limit is the right most position of the blob BELOW
1831  * the xheight.
1832  *
1833  *
1834  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1835  * "home". Perhaps we need something which say if the width ABOVE the
1836  * xht alone includes the whole of the reduced width, then use the full
1837  * blob box - Might still fail on italic F
1838  *
1839  * Alternatively we could be a little less severe and only reduce the
1840  * left and right edges by half the difference between the full box and
1841  * the reduced box.
1842  *
1843  * NOTE that we need to rotate all the coordinates as
1844  * find_blob_limits finds the y min and max within a specified x band
1845  *************************************************************************/
1846 TBOX Textord::reduced_box_for_blob(
1847  BLOBNBOX *blob,
1848  TO_ROW *row,
1849  inT16 *left_above_xht) {
1850  float baseline;
1851  float blob_x_centre;
1852  float left_limit;
1853  float right_limit;
1854  float junk;
1855  TBOX blob_box;
1856 
1857  /* Find baseline of centre of blob */
1858 
1859  blob_box = blob->bounding_box ();
1860  blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1861  baseline = row->baseline.y (blob_x_centre);
1862 
1863  /*
1864  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1865  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1866  */
1867  left_limit = (float) MAX_INT32;
1868  junk = (float) -MAX_INT32;
1869  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1870  static_cast<float>(MAX_INT16), left_limit, junk);
1871  if (left_limit > junk)
1872  *left_above_xht = MAX_INT16; //No area above xht
1873  else
1874  *left_above_xht = (inT16) floor (left_limit);
1875  /*
1876  Find reduced LH limit of blob - the left extent of the region ABOVE the
1877  baseline.
1878  */
1879  left_limit = (float) MAX_INT32;
1880  junk = (float) -MAX_INT32;
1881  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16),
1882  left_limit, junk);
1883 
1884  if (left_limit > junk)
1885  return TBOX (); //no area within xht so return empty box
1886  /*
1887  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1888  */
1889  junk = (float) MAX_INT32;
1890  right_limit = (float) -MAX_INT32;
1891  find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16),
1892  (baseline + row->xheight), junk, right_limit);
1893  if (junk > right_limit)
1894  return TBOX (); //no area within xht so return empty box
1895 
1896  return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
1897  ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
1898 }
1899 } // namespace tesseract