textline.cc (ocrad-0.24) | : | textline.cc (ocrad-0.25) | ||
---|---|---|---|---|
/* GNU Ocrad - Optical Character Recognition program | /* GNU Ocrad - Optical Character Recognition program | |||
Copyright (C) 2003-2014 Antonio Diaz Diaz. | Copyright (C) 2003-2015 Antonio Diaz Diaz. | |||
This program is free software: you can redistribute it and/or modify | This program is free software: you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | |||
the Free Software Foundation, either version 2 of the License, or | the Free Software Foundation, either version 2 of the License, or | |||
(at your option) any later version. | (at your option) any later version. | |||
This program is distributed in the hope that it will be useful, | This program is distributed in the hope that it will be useful, | |||
but WITHOUT ANY WARRANTY; without even the implied warranty of | but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
GNU General Public License for more details. | GNU General Public License for more details. | |||
skipping to change at line 32 | skipping to change at line 32 | |||
#include <string> | #include <string> | |||
#include <vector> | #include <vector> | |||
#include <stdint.h> | #include <stdint.h> | |||
#include "common.h" | #include "common.h" | |||
#include "histogram.h" | #include "histogram.h" | |||
#include "rational.h" | #include "rational.h" | |||
#include "rectangle.h" | #include "rectangle.h" | |||
#include "track.h" | #include "track.h" | |||
#include "ucs.h" | #include "ucs.h" | |||
#include "user_filter.h" | ||||
#include "bitmap.h" | #include "bitmap.h" | |||
#include "blob.h" | #include "blob.h" | |||
#include "character.h" | #include "character.h" | |||
#include "page_image.h" | #include "page_image.h" | |||
#include "textline.h" | #include "textline.h" | |||
namespace { | namespace { | |||
// Return the character position >= first preceding a big gap or eol. | // Returns the character position >= first preceding a big gap or eol. | |||
// | // | |||
int find_big_gap( const Textline & line, const int first, | int find_big_gap( const Textline & line, const int first, | |||
const int space_width_limit ) | const int space_width_limit ) | |||
{ | { | |||
int i = first; | int i = first; | |||
while( i + 1 < line.characters() ) | while( i + 1 < line.characters() ) | |||
{ | { | |||
const Character & c1 = line.character( i ); | const Character & c1 = line.character( i ); | |||
const Character & c2 = line.character( i + 1 ); | const Character & c2 = line.character( i + 1 ); | |||
const int gap = c2.left() - c1.right() - 1; | const int gap = c2.left() - c1.right() - 1; | |||
skipping to change at line 120 | skipping to change at line 121 | |||
for( int i = 0; i < characters(); ++i ) | for( int i = 0; i < characters(); ++i ) | |||
if( cpv[i]->h_includes( col ) ) return cpv[i]; | if( cpv[i]->h_includes( col ) ) return cpv[i]; | |||
return 0; | return 0; | |||
} | } | |||
Rectangle Textline::charbox( const Character & c ) const | Rectangle Textline::charbox( const Character & c ) const | |||
{ | { | |||
return Rectangle( c.left(), top( c.hcenter() ), c.right(), bottom( c.hcenter() ) ); | return Rectangle( c.left(), top( c.hcenter() ), c.right(), bottom( c.hcenter() ) ); | |||
} | } | |||
bool Textline::is_key_character( const int i ) const | ||||
{ | ||||
if( i < big_initials_ || i >= characters() ) | ||||
Ocrad::internal_error( "is_key_character, index out of bounds." ); | ||||
return ( cpv[i]->isalnum() && cpv[i]->guess( 0 ).code != 'J' && | ||||
cpv[i]->height() < 2 * height() && 2 * cpv[i]->height() > height() ); | ||||
} | ||||
void Textline::delete_character( const int i ) | void Textline::delete_character( const int i ) | |||
{ | { | |||
if( i < 0 || i >= characters() ) | if( i < 0 || i >= characters() ) | |||
Ocrad::internal_error( "delete_character, index out of bounds." ); | Ocrad::internal_error( "delete_character, index out of bounds." ); | |||
if( i < big_initials_ ) --big_initials_; | if( i < big_initials_ ) --big_initials_; | |||
delete cpv[i]; cpv.erase( cpv.begin() + i ); | delete cpv[i]; cpv.erase( cpv.begin() + i ); | |||
} | } | |||
int Textline::shift_characterp( Character * const p, const bool big ) | int Textline::shift_characterp( Character * const p, const bool big ) | |||
{ | { | |||
skipping to change at line 279 | skipping to change at line 288 | |||
void Textline::dprint( const Control & control, const bool graph, | void Textline::dprint( const Control & control, const bool graph, | |||
const bool recursive ) const | const bool recursive ) const | |||
{ | { | |||
if( graph || recursive ) | if( graph || recursive ) | |||
{ | { | |||
Histogram hist; | Histogram hist; | |||
for( int i = 0; i < characters(); ++i ) | for( int i = 0; i < characters(); ++i ) | |||
if( !character(i).maybe(' ') ) | if( !character(i).maybe(' ') ) | |||
hist.add_sample( character(i).height() ); | hist.add_sample( character(i).height() ); | |||
std::fprintf( control.outfile, "mean height = %d, median height = %d, track segments = %d\n", | std::fprintf( control.outfile, "mean height = %d, median height = %d, track segments = %d, big initials = %d\n", | |||
mean_height(), hist.median(), segments() ); | mean_height(), hist.median(), segments(), big_initials_ ); | |||
} | } | |||
for( int i = 0; i < characters(); ++i ) | for( int i = 0; i < characters(); ++i ) | |||
{ | { | |||
const Character & c = character( i ); | const Character & c = character( i ); | |||
if( i < big_initials_ ) c.dprint( control, c, graph, recursive ); | if( i < big_initials_ ) c.dprint( control, c, graph, recursive ); | |||
else c.dprint( control, charbox( c ), graph, recursive ); | else c.dprint( control, charbox( c ), graph, recursive ); | |||
} | } | |||
std::fputs( "\n", control.outfile ); | std::fputs( "\n", control.outfile ); | |||
} | } | |||
skipping to change at line 326 | skipping to change at line 335 | |||
c.only_guess( UCS::toupper( code ), 0 ); | c.only_guess( UCS::toupper( code ), 0 ); | |||
} | } | |||
} | } | |||
else c.recognize1( charset, charbox( c ) ); | else c.recognize1( charset, charbox( c ) ); | |||
} | } | |||
} | } | |||
void Textline::apply_filter( const Filter::Type filter ) | void Textline::apply_filter( const Filter::Type filter ) | |||
{ | { | |||
bool modified = false; | bool modified = false; | |||
for( int i = characters() - 1; i >= 0; --i ) | ||||
{ | ||||
Character & c = character( i ); | ||||
if( !c.guesses() ) continue; | ||||
c.apply_filter( filter ); | ||||
if( !c.guesses() ) { delete_character( i ); modified = true; } | ||||
} | ||||
if( filter == Filter::same_height ) | if( filter == Filter::same_height ) | |||
{ | { | |||
Histogram hist; | Histogram hist; | |||
for( int i = 0; i < characters(); ++i ) | for( int i = 0; i < characters(); ++i ) | |||
if( !character(i).maybe(' ') ) | if( !character(i).maybe(' ') ) | |||
hist.add_sample( character(i).height() ); | hist.add_sample( character(i).height() ); | |||
const int median_height = hist.median(); | const int median_height = hist.median(); | |||
for( int i = characters() - 1; i >= 0; --i ) | for( int i = characters() - 1; i >= 0; --i ) | |||
if( !character(i).maybe(' ') && | if( !character(i).maybe(' ') && | |||
!Ocrad::similar( character(i).height(), median_height, 13, 2 ) ) | !Ocrad::similar( character(i).height(), median_height, 10, 2 ) ) | |||
{ delete_character( i ); modified = true; } | { delete_character( i ); modified = true; } | |||
} | } | |||
if( modified ) // remove leadind/trailing/duplicate spaces | else | |||
{ | ||||
for( int i = characters() - 1; i >= 0; --i ) | for( int i = characters() - 1; i >= 0; --i ) | |||
if( character(i).maybe(' ') && | { | |||
( i == 0 || i == characters() - 1 || character(i-1).maybe(' ') ) ) | Character & c = character( i ); | |||
delete_character( i ); | if( !c.guesses() ) continue; | |||
c.apply_filter( filter ); | ||||
if( !c.guesses() && filter != Filter::upper_num_mark ) | ||||
{ delete_character( i ); modified = true; } | ||||
} | ||||
if( filter == Filter::upper_num_mark ) | ||||
join_broken_unrecognized_characters(); | ||||
} | ||||
if( modified ) remove_leadind_trailing_duplicate_spaces(); | ||||
} | ||||
void Textline::apply_user_filter( const User_filter & user_filter ) | ||||
{ | ||||
bool modified = false; | ||||
for( int i = characters() - 1; i >= 0; --i ) | ||||
{ | ||||
Character & c = character( i ); | ||||
if( !c.guesses() ) continue; | ||||
c.apply_user_filter( user_filter ); | ||||
if( !c.guesses() && user_filter.discard() ) | ||||
{ delete_character( i ); modified = true; } | ||||
} | ||||
if( user_filter.mark() ) join_broken_unrecognized_characters(); | ||||
if( modified ) remove_leadind_trailing_duplicate_spaces(); | ||||
} | ||||
void Textline::join_broken_unrecognized_characters() | ||||
{ | ||||
for( int i = characters() - 1; i > 0; --i ) | ||||
if( !character(i).guesses() && | ||||
character(i).h_overlaps( character( i - 1 ) ) ) | ||||
delete_character( i ); | ||||
} | ||||
void Textline::remove_leadind_trailing_duplicate_spaces() | ||||
{ | ||||
for( int i = characters() - 1; i >= 0; --i ) | ||||
if( character(i).maybe(' ') && | ||||
( i == 0 || i == characters() - 1 || character(i-1).maybe(' ') ) ) | ||||
delete_character( i ); | ||||
} | } | |||
End of changes. 9 change blocks. | ||||
17 lines changed or deleted | 59 lines changed or added |