ucs.cc (ocrad-0.24) | : | ucs.cc (ocrad-0.25) | ||
---|---|---|---|---|
/* GNU Ocrad - Optical Character Recognition program | /* GNU Ocrad - Optical Character Recognition program | |||
Copyright (C) 2003-2014 Antonio Diaz Diaz. | Copyright (C) 2003-2015 Antonio Diaz Diaz. | |||
This program is free software: you can redistribute it and/or modify | This program is free software: you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | |||
the Free Software Foundation, either version 2 of the License, or | the Free Software Foundation, either version 2 of the License, or | |||
(at your option) any later version. | (at your option) any later version. | |||
This program is distributed in the hope that it will be useful, | This program is distributed in the hope that it will be useful, | |||
but WITHOUT ANY WARRANTY; without even the implied warranty of | but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
GNU General Public License for more details. | GNU General Public License for more details. | |||
skipping to change at line 49 | skipping to change at line 49 | |||
case CIACUTE: | case CIACUTE: | |||
case CICIRCU: | case CICIRCU: | |||
case CIDIAER: | case CIDIAER: | |||
case CIDOT : return 'I'; | case CIDOT : return 'I'; | |||
case CNTILDE: return 'N'; | case CNTILDE: return 'N'; | |||
case COGRAVE: | case COGRAVE: | |||
case COACUTE: | case COACUTE: | |||
case COCIRCU: | case COCIRCU: | |||
case COTILDE: | case COTILDE: | |||
case CODIAER: return 'O'; | case CODIAER: return 'O'; | |||
case CSCEDI : return 'S'; | case CSCEDI : | |||
case CSCARON: return 'S'; | ||||
case CUGRAVE: | case CUGRAVE: | |||
case CUACUTE: | case CUACUTE: | |||
case CUCIRCU: | case CUCIRCU: | |||
case CUDIAER: return 'U'; | case CUDIAER: return 'U'; | |||
case CYACUTE: return 'Y'; | case CYACUTE: | |||
case CYDIAER: return 'Y'; | ||||
case CZCARON: return 'Z'; | ||||
case SAGRAVE: | case SAGRAVE: | |||
case SAACUTE: | case SAACUTE: | |||
case SACIRCU: | case SACIRCU: | |||
case SATILDE: | case SATILDE: | |||
case SADIAER: | case SADIAER: | |||
case SARING : return 'a'; | case SARING : return 'a'; | |||
case SCCEDI : return 'c'; | case SCCEDI : return 'c'; | |||
case SEGRAVE: | case SEGRAVE: | |||
case SEACUTE: | case SEACUTE: | |||
case SECIRCU: | case SECIRCU: | |||
skipping to change at line 78 | skipping to change at line 81 | |||
case SIACUTE: | case SIACUTE: | |||
case SICIRCU: | case SICIRCU: | |||
case SIDIAER: | case SIDIAER: | |||
case SINODOT: return 'i'; | case SINODOT: return 'i'; | |||
case SNTILDE: return 'n'; | case SNTILDE: return 'n'; | |||
case SOGRAVE: | case SOGRAVE: | |||
case SOACUTE: | case SOACUTE: | |||
case SOCIRCU: | case SOCIRCU: | |||
case SOTILDE: | case SOTILDE: | |||
case SODIAER: return 'o'; | case SODIAER: return 'o'; | |||
case SSCEDI : return 's'; | case SSCEDI : | |||
case SSCARON: return 's'; | ||||
case SUGRAVE: | case SUGRAVE: | |||
case SUACUTE: | case SUACUTE: | |||
case SUCIRCU: | case SUCIRCU: | |||
case SUDIAER: return 'u'; | case SUDIAER: return 'u'; | |||
case SYACUTE: | case SYACUTE: | |||
case SYDIAER: return 'y'; | case SYDIAER: return 'y'; | |||
case SZCARON: return 'z'; | ||||
default: return 0; | default: return 0; | |||
} | } | |||
} | } | |||
int UCS::compose( const int letter, const int accent ) | int UCS::compose( const int letter, const int accent ) | |||
{ | { | |||
switch( letter ) | switch( letter ) | |||
{ | { | |||
case 'A': if( accent == '\'') return CAACUTE; | case 'A': if( accent == '\'') return CAACUTE; | |||
if( accent == '`' ) return CAGRAVE; | if( accent == '`' ) return CAGRAVE; | |||
skipping to change at line 118 | skipping to change at line 123 | |||
case 'O': if( accent == '\'') return COACUTE; | case 'O': if( accent == '\'') return COACUTE; | |||
if( accent == '`' ) return COGRAVE; | if( accent == '`' ) return COGRAVE; | |||
if( accent == '^' ) return COCIRCU; | if( accent == '^' ) return COCIRCU; | |||
if( accent == ':' ) return CODIAER; break; | if( accent == ':' ) return CODIAER; break; | |||
case 'S': return CSCARON; | case 'S': return CSCARON; | |||
case 'U': | case 'U': | |||
case 'V': if( accent == '\'') return CUACUTE; | case 'V': if( accent == '\'') return CUACUTE; | |||
if( accent == '`' ) return CUGRAVE; | if( accent == '`' ) return CUGRAVE; | |||
if( accent == '^' ) return CUCIRCU; | if( accent == '^' ) return CUCIRCU; | |||
if( accent == ':' ) return CUDIAER; break; | if( accent == ':' ) return CUDIAER; break; | |||
case 'Y': if( accent == '\'') return CYACUTE; | ||||
if( accent == ':' ) return CYDIAER; break; | ||||
case 'Z': return CZCARON; | case 'Z': return CZCARON; | |||
case 'a': if( accent == '\'') return SAACUTE; | case 'a': if( accent == '\'') return SAACUTE; | |||
if( accent == '`' ) return SAGRAVE; | if( accent == '`' ) return SAGRAVE; | |||
if( accent == '^' ) return SACIRCU; | if( accent == '^' ) return SACIRCU; | |||
if( accent == ':' ) return SADIAER; break; | if( accent == ':' ) return SADIAER; break; | |||
case 'e': if( accent == '\'') return SEACUTE; | case 'e': if( accent == '\'') return SEACUTE; | |||
if( accent == '`' ) return SEGRAVE; | if( accent == '`' ) return SEGRAVE; | |||
if( accent == '^' ) return SECIRCU; | if( accent == '^' ) return SECIRCU; | |||
if( accent == ':' ) return SEDIAER; break; | if( accent == ':' ) return SEDIAER; break; | |||
case '9': | case '9': | |||
skipping to change at line 164 | skipping to change at line 171 | |||
bool UCS::isalnum( const int code ) | bool UCS::isalnum( const int code ) | |||
{ | { | |||
return ( UCS::isalpha( code ) || UCS::isdigit( code ) ); | return ( UCS::isalpha( code ) || UCS::isdigit( code ) ); | |||
} | } | |||
bool UCS::isalpha( const int code ) | bool UCS::isalpha( const int code ) | |||
{ | { | |||
return ( ( code < 128 && std::isalpha( code ) ) || base_letter( code ) ); | return ( ( code < 128 && std::isalpha( code ) ) || base_letter( code ) ); | |||
} | } | |||
bool UCS::isdigit( const int code ) | ||||
{ | ||||
return ( code <= '9' && code >= '0' ); | ||||
} | ||||
bool UCS::ishigh( const int code ) | bool UCS::ishigh( const int code ) | |||
{ | { | |||
if( isupper( code ) || isdigit( code ) ) return true; | if( isupper( code ) || isdigit( code ) ) return true; | |||
switch( code ) | switch( code ) | |||
{ | { | |||
case 'b': case 'd': case 'f': case 'g': case 'h': case 'i': case 'j': | case 'b': case 'd': case 'f': case 'g': case 'h': case 'i': case 'j': | |||
case 'k': case 'l': case 'p': case 'q': case 't': case 'y': case '|': | case 'k': case 'l': case 'p': case 'q': case 't': case 'y': case '|': | |||
return true; | return true; | |||
default : return false; | default : return false; | |||
} | } | |||
skipping to change at line 228 | skipping to change at line 230 | |||
switch( code ) | switch( code ) | |||
{ | { | |||
case 'c': case 'o': case 's': case 'u': case 'v': case 'w': | case 'c': case 'o': case 's': case 'u': case 'v': case 'w': | |||
case 'x': case 'z': return true; | case 'x': case 'z': return true; | |||
default : return false; | default : return false; | |||
} | } | |||
} | } | |||
bool UCS::isspace( const int code ) | bool UCS::isspace( const int code ) | |||
{ | { | |||
return ( code < 128 && std::isspace( code ) ); | return ( code < 128 && std::isspace( code ) ) || code == 0xA0; | |||
} | } | |||
bool UCS::isupper( const int code ) | bool UCS::isupper( const int code ) | |||
{ | { | |||
if( code < 128 && std::isupper( code ) ) return true; | if( code < 128 && std::isupper( code ) ) return true; | |||
const int base = base_letter( code ); | const int base = base_letter( code ); | |||
return ( base && std::isupper( base ) ); | return ( base && std::isupper( base ) ); | |||
} | } | |||
bool UCS::isupper_normal_width( const int code ) | ||||
{ | ||||
if( code >= 128 || !std::isupper( code ) ) return false; | ||||
switch( code ) | ||||
{ | ||||
case 'I': case 'J': case 'L': case 'M': case 'Q': case 'W': return false; | ||||
default : return true; | ||||
} | ||||
} | ||||
bool UCS::isvowel( int code ) | bool UCS::isvowel( int code ) | |||
{ | { | |||
if( code >= 128 ) code = base_letter( code ); | if( code >= 128 ) code = base_letter( code ); | |||
if( !code || !std::isalpha( code ) ) return false; | if( !code || !std::isalpha( code ) ) return false; | |||
code = std::tolower( code ); | code = std::tolower( code ); | |||
return ( code == 'a' || code == 'e' || code == 'i' || | return ( code == 'a' || code == 'e' || code == 'i' || | |||
code == 'o' || code == 'u' ); | code == 'o' || code == 'u' ); | |||
} | } | |||
unsigned char UCS::map_to_byte( const int code ) | unsigned char UCS::map_to_byte( const int code ) | |||
skipping to change at line 261 | skipping to change at line 273 | |||
switch( code ) | switch( code ) | |||
{ | { | |||
case CGBREVE: return 0xD0; | case CGBREVE: return 0xD0; | |||
case SGBREVE: return 0xF0; | case SGBREVE: return 0xF0; | |||
case CIDOT : return 0xDD; | case CIDOT : return 0xDD; | |||
case SINODOT: return 0xFD; | case SINODOT: return 0xFD; | |||
case CSCEDI : return 0xDE; | case CSCEDI : return 0xDE; | |||
case SSCEDI : return 0xFE; | case SSCEDI : return 0xFE; | |||
case CSCARON: return 0xA6; | case CSCARON: return 0xA6; | |||
case SSCARON: return 0xA8; | case SSCARON: return 0xA8; | |||
case CYDIAER: return 0xBE; | ||||
case CZCARON: return 0xB4; | case CZCARON: return 0xB4; | |||
case SZCARON: return 0xB8; | case SZCARON: return 0xB8; | |||
case EURO : return 0xA4; | case EURO : return 0xA4; | |||
default : return 0; | default : return 0; | |||
} | } | |||
} | } | |||
int UCS::map_to_ucs( const unsigned char ch ) | ||||
{ | ||||
switch( ch ) | ||||
{ | ||||
case 0xA4: return EURO; | ||||
case 0xA6: return CSCARON; | ||||
case 0xA8: return SSCARON; | ||||
case 0xB4: return CZCARON; | ||||
case 0xB8: return SZCARON; | ||||
case 0xBC: return CLIGOE; | ||||
case 0xBD: return SLIGOE; | ||||
case 0xBE: return CYDIAER; | ||||
} | ||||
return ch; | ||||
} | ||||
// does not work for 'code' == 0 | ||||
const char * UCS::ucs_to_utf8( const int code ) | const char * UCS::ucs_to_utf8( const int code ) | |||
{ | { | |||
static char s[7]; | static char s[7]; | |||
if( code < 0 || code > 0x7FFFFFFF ) { s[0] = 0; return s; } // invalid code | if( code < 0 || code > 0x7FFFFFFF ) { s[0] = 0; return s; } // invalid code | |||
if( code < 128 ) { s[0] = code; s[1] = 0; return s; } // plain ascii | if( code < 128 ) { s[0] = code; s[1] = 0; return s; } // plain ascii | |||
int i, mask; | int i, mask; | |||
if( code < 0x800 ) { i = 2; mask = 0xC0; } // 110X XXXX | if( code < 0x800 ) { i = 2; mask = 0xC0; } // 110X XXXX | |||
else if( code < 0x10000 ) { i = 3; mask = 0xE0; } // 1110 XXXX | else if( code < 0x10000 ) { i = 3; mask = 0xE0; } // 1110 XXXX | |||
else if( code < 0x200000 ) { i = 4; mask = 0xF0; } // 1111 0XXX | else if( code < 0x200000 ) { i = 4; mask = 0xF0; } // 1111 0XXX | |||
else if( code < 0x4000000 ) { i = 5; mask = 0xF8; } // 1111 10XX | else if( code < 0x4000000 ) { i = 5; mask = 0xF8; } // 1111 10XX | |||
else { i = 6; mask = 0xFC; } // 1111 110X | else { i = 6; mask = 0xFC; } // 1111 110X | |||
s[i] = 0; --i; | s[i] = 0; --i; | |||
int d = 0; | int d = 0; | |||
for( ; i > 0; --i, d+=6 ) | for( ; i > 0; --i, d += 6 ) | |||
s[i] = 0x80 | ( ( code >> d ) & 0x3F ); // 10XX XXXX | s[i] = 0x80 | ( ( code >> d ) & 0x3F ); // 10XX XXXX | |||
s[0] = mask | ( code >> d ); | s[0] = mask | ( code >> d ); | |||
return s; | return s; | |||
} | } | |||
int UCS::to_nearest_digit( const int code ) | int UCS::to_nearest_digit( const int code ) | |||
{ | { | |||
switch( code ) | switch( code ) | |||
{ | { | |||
case 'D': | ||||
case 'O': | case 'O': | |||
case 'Q': | case 'Q': | |||
case 'o': return '0'; | case 'o': return '0'; | |||
case '|': | ||||
case 'I': | case 'I': | |||
case 'L': | case 'L': | |||
case 'l': | case 'l': | |||
case '|': | ||||
case SINODOT: return '1'; | case SINODOT: return '1'; | |||
case 'Z': | case 'Z': | |||
case 'z': return '2'; | case 'z': return '2'; | |||
case 'A': | case 'A': | |||
case 'q': return '4'; | case 'q': return '4'; | |||
case 'S': | case 'S': | |||
case 's': return '5'; | case 's': return '5'; | |||
case 'G': | case 'G': | |||
case 'b': | case 'b': | |||
case SOACUTE: return '6'; | case SOACUTE: return '6'; | |||
skipping to change at line 341 | skipping to change at line 372 | |||
case '8': return 'B'; | case '8': return 'B'; | |||
case '9': return 'g'; | case '9': return 'g'; | |||
default: return code; | default: return code; | |||
} | } | |||
} | } | |||
int UCS::to_nearest_upper_num( const int code ) | int UCS::to_nearest_upper_num( const int code ) | |||
{ | { | |||
switch( code ) | switch( code ) | |||
{ | { | |||
case '(': | ||||
case '[': return 'C'; | ||||
case 'l': | case 'l': | |||
case '|': return 'I'; | case '|': return 'I'; | |||
case DEG: return 'O'; | case DEG: return 'O'; | |||
case MICRO: return 'U'; | case MICRO: return 'U'; | |||
case POW1: | case POW1: | |||
case SINODOT: return '1'; | case SINODOT: return '1'; | |||
case POW2: return '2'; | case POW2: return '2'; | |||
case POW3: return '3'; | case POW3: return '3'; | |||
case 'q': return '4'; | case 'q': return '4'; | |||
case 'b': | case 'b': | |||
skipping to change at line 388 | skipping to change at line 421 | |||
case SIACUTE: return CIACUTE; | case SIACUTE: return CIACUTE; | |||
case SICIRCU: return CICIRCU; | case SICIRCU: return CICIRCU; | |||
case SIDIAER: return CIDIAER; | case SIDIAER: return CIDIAER; | |||
case SNTILDE: return CNTILDE; | case SNTILDE: return CNTILDE; | |||
case SOGRAVE: return COGRAVE; | case SOGRAVE: return COGRAVE; | |||
case SOACUTE: return COACUTE; | case SOACUTE: return COACUTE; | |||
case SOCIRCU: return COCIRCU; | case SOCIRCU: return COCIRCU; | |||
case SOTILDE: return COTILDE; | case SOTILDE: return COTILDE; | |||
case SODIAER: return CODIAER; | case SODIAER: return CODIAER; | |||
case SSCEDI : return CSCEDI; | case SSCEDI : return CSCEDI; | |||
case SSCARON: return CSCARON; | ||||
case SUGRAVE: return CUGRAVE; | case SUGRAVE: return CUGRAVE; | |||
case SUACUTE: return CUACUTE; | case SUACUTE: return CUACUTE; | |||
case SUCIRCU: return CUCIRCU; | case SUCIRCU: return CUCIRCU; | |||
case SUDIAER: return CUDIAER; | case SUDIAER: return CUDIAER; | |||
case SYACUTE: return CYACUTE; | case SYACUTE: return CYACUTE; | |||
case SYDIAER: return CYDIAER; | ||||
case SZCARON: return CZCARON; | ||||
default: return code; | default: return code; | |||
} | } | |||
} | } | |||
End of changes. 18 change blocks. | ||||
12 lines changed or deleted | 48 lines changed or added |