"Fossies" - the Fresh Open Source Software Archive

Member "tesseract-ocr/doc/html/class_u_n_i_c_h_a_r_s_e_t.html" (26 Oct 2012, 246271 Bytes) of package /linux/misc/old/tesseract-ocr-3.02.02-doc-html.tar.gz:


Caution: In this restricted "Fossies" environment the current HTML page may not be correctly presented and may have some non-functional links. Alternatively, you can try to browse the pure source code or just view or download the uninterpreted raw source code. If the rendering is insufficient you may try to find and view the page on the tesseract-ocr-3.02.02-doc-html.tar.gz project site itself.

Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UNICHARSET Class Reference

#include <unicharset.h>

List of all members.

Classes

struct  UNICHAR_PROPERTIES
struct  UNICHAR_SLOT

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}

Public Member Functions

 UNICHARSET ()
 ~UNICHARSET ()
const UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
const UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
int step (const char *str) const
bool encodable_string (const char *str, int *first_bad_position) const
const char *const id_to_unichar (UNICHAR_ID id) const
const char *const id_to_unichar_ext (UNICHAR_ID id) const
STRING debug_str (UNICHAR_ID id) const
STRING debug_str (const char *unichar_repr) const
void unichar_insert (const char *const unichar_repr)
bool contains_unichar_id (UNICHAR_ID unichar_id) const
bool contains_unichar (const char *const unichar_repr) const
bool contains_unichar (const char *const unichar_repr, int length) const
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
void delete_pointers_in_unichars ()
void clear ()
int size () const
void reserve (int unichars_number)
bool save_to_file (const char *const filename) const
bool save_to_file (FILE *file) const
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
bool load_from_inmemory_file (const char *const memory, int mem_size)
bool load_from_file (const char *const filename, bool skip_fragments)
bool load_from_file (const char *const filename)
bool load_from_file (FILE *file, bool skip_fragments)
bool load_from_file (FILE *file)
void post_load_setup ()
bool major_right_to_left () const
void set_black_and_whitelist (const char *blacklist, const char *whitelist)
void set_isalpha (UNICHAR_ID unichar_id, bool value)
void set_islower (UNICHAR_ID unichar_id, bool value)
void set_isupper (UNICHAR_ID unichar_id, bool value)
void set_isdigit (UNICHAR_ID unichar_id, bool value)
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
void set_isngram (UNICHAR_ID unichar_id, bool value)
void set_script (UNICHAR_ID unichar_id, const char *value)
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void set_normed (UNICHAR_ID unichar_id, const char *normed)
bool get_isalpha (UNICHAR_ID unichar_id) const
bool get_islower (UNICHAR_ID unichar_id) const
bool get_isupper (UNICHAR_ID unichar_id) const
bool get_isdigit (UNICHAR_ID unichar_id) const
bool get_ispunctuation (UNICHAR_ID unichar_id) const
bool get_isngram (UNICHAR_ID unichar_id) const
bool get_isprivate (UNICHAR_ID unichar_id) const
bool top_bottom_useful () const
void set_ranges_empty ()
void SetPropertiesFromOther (const UNICHARSET &src)
void ExpandRangesFromOther (const UNICHARSET &src)
void AppendOtherUnicharset (const UNICHARSET &src)
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
void get_width_range (UNICHAR_ID unichar_id, int *min_width, int *max_width) const
void set_width_range (UNICHAR_ID unichar_id, int min_width, int max_width)
void get_bearing_range (UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const
void set_bearing_range (UNICHAR_ID unichar_id, int min_bearing, int max_bearing)
void get_advance_range (UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const
void set_advance_range (UNICHAR_ID unichar_id, int min_advance, int max_advance)
int get_script (UNICHAR_ID unichar_id) const
unsigned int get_properties (UNICHAR_ID unichar_id) const
char get_chartype (UNICHAR_ID unichar_id) const
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
Direction get_direction (UNICHAR_ID unichar_id) const
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
bool get_isalpha (const char *const unichar_repr) const
bool get_islower (const char *const unichar_repr) const
bool get_isupper (const char *const unichar_repr) const
bool get_isdigit (const char *const unichar_repr) const
bool get_ispunctuation (const char *const unichar_repr) const
unsigned int get_properties (const char *const unichar_repr) const
char get_chartype (const char *const unichar_repr) const
int get_script (const char *const unichar_repr) const
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
bool get_isalpha (const char *const unichar_repr, int length) const
bool get_islower (const char *const unichar_repr, int length) const
bool get_isupper (const char *const unichar_repr, int length) const
bool get_isdigit (const char *const unichar_repr, int length) const
bool get_ispunctuation (const char *const unichar_repr, int length) const
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
int get_script (const char *const unichar_repr, int length) const
int get_script_table_size () const
const char * get_script_from_script_id (int id) const
int get_script_id_from_name (const char *script_name) const
bool is_null_script (const char *script) const
int add_script (const char *script)
bool get_enabled (UNICHAR_ID unichar_id) const
int null_sid () const
int common_sid () const
int latin_sid () const
int cyrillic_sid () const
int greek_sid () const
int han_sid () const
int hiragana_sid () const
int katakana_sid () const
int default_sid () const
bool script_has_upper_lower () const
bool script_has_xheight () const

Static Public Member Functions

static STRING debug_utf8_str (const char *str)

Static Public Attributes

static const char * kCustomLigatures [][2]

Detailed Description

Definition at line 127 of file unicharset.h.


Member Enumeration Documentation

Enumerator:
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 135 of file unicharset.h.


Constructor & Destructor Documentation

UNICHARSET::UNICHARSET ( )

Definition at line 146 of file unicharset.cpp.

:
unichars(NULL),
ids(),
size_used(0),
size_reserved(0),
script_table(NULL),
script_table_size_used(0),
null_script("NULL") {
clear();
}
UNICHARSET::~UNICHARSET ( )

Definition at line 157 of file unicharset.cpp.

{
clear();
}

Member Function Documentation

int UNICHARSET::add_script ( const char *  script)

Definition at line 866 of file unicharset.cpp.

{
for (int i = 0; i < script_table_size_used; ++i) {
if (strcmp(script, script_table[i]) == 0)
return i;
}
if (script_table_size_reserved == 0) {
script_table_size_reserved = 8;
script_table = new char*[script_table_size_reserved];
}
if (script_table_size_used + 1 >= script_table_size_reserved) {
char** new_script_table = new char*[script_table_size_reserved * 2];
memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
delete[] script_table;
script_table = new_script_table;
script_table_size_reserved = 2 * script_table_size_reserved;
}
script_table[script_table_size_used] = new char[strlen(script) + 1];
strcpy(script_table[script_table_size_used], script);
return script_table_size_used++;
}
void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 375 of file unicharset.cpp.

{
for (int ch = 0; ch < src.size_used; ++ch) {
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
const char* utf8 = src.id_to_unichar(ch);
if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
// Only use fully valid entries.
tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
utf8, src_props.min_bottom, src_props.max_bottom,
src_props.min_top, src_props.max_top,
src_props.min_width, src_props.max_width,
src_props.min_bearing, src_props.max_bearing,
src_props.min_advance, src_props.max_advance);
continue;
}
int id = size_used;
if (contains_unichar(utf8)) {
id = unichar_to_id(utf8);
} else {
unichars[id].properties.SetRangesEmpty();
}
if (!unichars[id].properties.AnyRangeEmpty()) {
// Just expand current ranges.
unichars[id].properties.ExpandRangesFrom(src_props);
} else {
// Copy properties from src_props.
unichars[id].properties.CopyFrom(src_props);
// Setup the script_id, other_case and mirror properly.
const char* script = src.get_script_from_script_id(src_props.script_id);
unichars[id].properties.script_id = add_script(script);
const char* other_case = src.id_to_unichar(src_props.other_case);
if (!contains_unichar(other_case)) {
unichar_insert(other_case);
unichars[size_used - 1].properties.SetRangesEmpty();
// Other_case will have its ranges set later as it is contained in src.
}
unichars[id].properties.other_case = unichar_to_id(other_case);
const char* mirror_str = src.id_to_unichar(src_props.mirror);
if (!contains_unichar(mirror_str)) {
unichar_insert(mirror_str);
unichars[size_used - 1].properties.SetRangesEmpty();
// Mirror will have its ranges set later as it is contained in src.
}
unichars[id].properties.mirror = unichar_to_id(mirror_str);
}
}
}
void UNICHARSET::clear ( )
inline

Definition at line 233 of file unicharset.h.

{
if (script_table != NULL) {
for (int i = 0; i < script_table_size_used; ++i)
delete[] script_table[i];
delete[] script_table;
script_table = NULL;
script_table_size_used = 0;
}
if (unichars != NULL) {
delete[] unichars;
unichars = NULL;
}
script_table_size_reserved = 0;
size_reserved = 0;
size_used = 0;
ids.clear();
top_bottom_set_ = false;
script_has_upper_lower_ = false;
script_has_xheight_ = false;
null_sid_ = 0;
common_sid_ = 0;
latin_sid_ = 0;
cyrillic_sid_ = 0;
greek_sid_ = 0;
han_sid_ = 0;
hiragana_sid_ = 0;
katakana_sid_ = 0;
}
int UNICHARSET::common_sid ( ) const
inline

Definition at line 753 of file unicharset.h.

{ return common_sid_; }
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 543 of file unicharset.cpp.

{
return ids.contains(unichar_repr);
}
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 547 of file unicharset.cpp.

{
if (length == 0) {
return false;
}
return ids.contains(unichar_repr, length);
}
bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 209 of file unicharset.h.

{
return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
unichar_id >= 0;
}
int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 755 of file unicharset.h.

{ return cyrillic_sid_; }
STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 285 of file unicharset.cpp.

{
if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
const CHAR_FRAGMENT *fragment = this->get_fragment(id);
if (fragment) {
return fragment->to_string();
}
const char* str = id_to_unichar(id);
STRING result = debug_utf8_str(str);
// Append a for lower alpha, A for upper alpha, and x if alpha but neither.
if (get_isalpha(id)) {
if (get_islower(id))
result += "a";
else if (get_isupper(id))
result += "A";
else
result += "x";
}
// Append 0 if a digit.
if (get_isdigit(id)) {
result += "0";
}
// Append p if a punctuation symbol.
if (get_ispunctuation(id)) {
result += "p";
}
return result;
}
STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 200 of file unicharset.h.

{
return debug_str(unichar_to_id(unichar_repr));
}
STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 261 of file unicharset.cpp.

{
STRING result = str;
result += " [";
int step = 1;
// Chop into unicodes and code each as hex.
for (int i = 0; str[i] != '\0'; i += step) {
char hex[sizeof(int) * 2 + 1];
step = UNICHAR::utf8_step(str + i);
if (step == 0) {
step = 1;
sprintf(hex, "%x", str[i]);
} else {
UNICHAR ch(str + i, step);
sprintf(hex, "%x", ch.first_uni());
}
result += hex;
result += " ";
}
result += "]";
return result;
}
int UNICHARSET::default_sid ( ) const
inline

Definition at line 760 of file unicharset.h.

{ return default_sid_; }
void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 223 of file unicharset.h.

{
for (int i = 0; i < size_used; ++i) {
if (unichars[i].properties.fragment != NULL) {
delete unichars[i].properties.fragment;
unichars[i].properties.fragment = NULL;
}
}
}
bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 220 of file unicharset.cpp.

{
for (int i = 0, len = strlen(str); i < len; ) {
int increment = step(str + i);
if (increment == 0) {
if (first_bad_position) *first_bad_position = i;
return false;
}
i += increment;
}
return true;
}
bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 555 of file unicharset.cpp.

{
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}
void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 361 of file unicharset.cpp.

{
for (int ch = 0; ch < size_used; ++ch) {
const char* utf8 = id_to_unichar(ch);
UNICHAR_PROPERTIES properties;
if (src.GetStrProperties(utf8, &properties)) {
// Expand just the ranges from properties.
unichars[ch].properties.ExpandRangesFrom(properties);
}
}
}
void UNICHARSET::get_advance_range ( UNICHAR_ID  unichar_id,
int *  min_advance,
int *  max_advance 
) const
inline

Definition at line 531 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) {
*min_advance = *max_advance = 0;
return;
}
*min_advance = unichars[unichar_id].properties.min_advance;
*max_advance = unichars[unichar_id].properties.max_advance;
}
void UNICHARSET::get_bearing_range ( UNICHAR_ID  unichar_id,
int *  min_bearing,
int *  max_bearing 
) const
inline

Definition at line 510 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) {
*min_bearing = *max_bearing = 0;
return;
}
*min_bearing = unichars[unichar_id].properties.min_bearing;
*max_bearing = unichars[unichar_id].properties.max_bearing;
}
char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 502 of file unicharset.cpp.

{
if (this->get_isupper(id)) return 'A';
if (this->get_islower(id)) return 'a';
if (this->get_isalpha(id)) return 'x';
if (this->get_isdigit(id)) return '0';
if (this->get_ispunctuation(id)) return 'p';
return 0;
}
char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 647 of file unicharset.h.

{
return get_chartype(unichar_to_id(unichar_repr));
}
Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 579 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
return unichars[unichar_id].properties.direction;
}
bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 747 of file unicharset.h.

{
return unichars[unichar_id].properties.enabled;
}
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 610 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return NULL;
return unichars[unichar_id].properties.fragment;
}
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 660 of file unicharset.h.

{
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
!ids.contains(unichar_repr)) {
return NULL;
}
return get_fragment(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 392 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.isalpha;
}
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 617 of file unicharset.h.

{
return get_isalpha(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 670 of file unicharset.h.

{
return get_isalpha(unichar_to_id(unichar_repr, length));
}
bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 413 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.isdigit;
}
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 632 of file unicharset.h.

{
return get_isdigit(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 691 of file unicharset.h.

{
return get_isdigit(unichar_to_id(unichar_repr, length));
}
bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 399 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.islower;
}
bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 622 of file unicharset.h.

{
return get_islower(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 677 of file unicharset.h.

{
return get_islower(unichar_to_id(unichar_repr, length));
}
bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 427 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.isngram;
}
bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 316 of file unicharset.cpp.

{
UNICHAR uc(id_to_unichar(unichar_id), -1);
int uni = uc.first_uni();
return (uni >= 0xE000 && uni <= 0xF8FF);
}
bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 420 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.ispunctuation;
}
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 637 of file unicharset.h.

{
return get_ispunctuation(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 698 of file unicharset.h.

{
return get_ispunctuation(unichar_to_id(unichar_repr, length));
}
bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 406 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return false;
return unichars[unichar_id].properties.isupper;
}
bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 627 of file unicharset.h.

{
return get_isupper(unichar_to_id(unichar_repr));
}
bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 684 of file unicharset.h.

{
return get_isupper(unichar_to_id(unichar_repr, length));
}
UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 586 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
return unichars[unichar_id].properties.mirror;
}
const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 704 of file unicharset.h.

{
return unichars[unichar_id].properties.normed.string();
}
UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 572 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
return unichars[unichar_id].properties.other_case;
}
unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 487 of file unicharset.cpp.

{
unsigned int properties = 0;
if (this->get_isalpha(id))
properties |= ISALPHA_MASK;
if (this->get_islower(id))
properties |= ISLOWER_MASK;
if (this->get_isupper(id))
properties |= ISUPPER_MASK;
if (this->get_isdigit(id))
properties |= ISDIGIT_MASK;
if (this->get_ispunctuation(id))
properties |= ISPUNCTUATION_MASK;
return properties;
}
unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 643 of file unicharset.h.

{
return get_properties(unichar_to_id(unichar_repr));
}
int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 552 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
return unichars[unichar_id].properties.script_id;
}
int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 654 of file unicharset.h.

{
return get_script(unichar_to_id(unichar_repr));
}
int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 712 of file unicharset.h.

{
return get_script(unichar_to_id(unichar_repr, length));
}
const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 723 of file unicharset.h.

{
if (id >= script_table_size_used || id < 0)
return null_script;
return script_table[id];
}
int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 944 of file unicharset.cpp.

{
for (int i = 0; i < script_table_size_used; ++i) {
if (strcmp(script_name, script_table[i]) == 0)
return i;
}
return 0; // 0 is always the null_script
}
int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 718 of file unicharset.h.

{
return script_table_size_used;
}
void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 459 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) {
*min_bottom = *min_top = 0;
*max_bottom = *max_top = 256; // kBlnCellHeight
return;
}
*min_bottom = unichars[unichar_id].properties.min_bottom;
*max_bottom = unichars[unichar_id].properties.max_bottom;
*min_top = unichars[unichar_id].properties.min_top;
*max_top = unichars[unichar_id].properties.max_top;
}
void UNICHARSET::get_width_range ( UNICHAR_ID  unichar_id,
int *  min_width,
int *  max_width 
) const
inline

Definition at line 489 of file unicharset.h.

{
if (INVALID_UNICHAR_ID == unichar_id) {
*min_width = 0;
*max_width = 256; // kBlnCellHeight;
return;
}
*min_width = unichars[unichar_id].properties.min_width;
*max_width = unichars[unichar_id].properties.max_width;
}
int UNICHARSET::greek_sid ( ) const
inline

Definition at line 756 of file unicharset.h.

{ return greek_sid_; }
int UNICHARSET::han_sid ( ) const
inline

Definition at line 757 of file unicharset.h.

{ return han_sid_; }
int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 758 of file unicharset.h.

{ return hiragana_sid_; }
const char *const UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 233 of file unicharset.cpp.

{
if (id == INVALID_UNICHAR_ID) {
return INVALID_UNICHAR;
}
ASSERT_HOST(id < this->size());
return unichars[id].representation;
}
const char *const UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 241 of file unicharset.cpp.

{
if (id == INVALID_UNICHAR_ID) {
return INVALID_UNICHAR;
}
ASSERT_HOST(id < this->size());
// Resolve from the kCustomLigatures table if this is a private encoding.
if (get_isprivate(id)) {
const char* ch = id_to_unichar(id);
for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
if (!strcmp(ch, kCustomLigatures[i][1])) {
return kCustomLigatures[i][0];
}
}
}
// Otherwise return the stored representation.
return unichars[id].representation;
}
bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 737 of file unicharset.h.

{
return script == null_script;
}
int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 759 of file unicharset.h.

{ return katakana_sid_; }
int UNICHARSET::latin_sid ( ) const
inline

Definition at line 754 of file unicharset.h.

{ return latin_sid_; }
bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 298 of file unicharset.h.

{
FILE* file = fopen(filename, "rb");
if (file == NULL) return false;
bool result = load_from_file(file, skip_fragments);
fclose(file);
return result;
}
bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 306 of file unicharset.h.

{
return load_from_file(filename, false);
}
bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 638 of file unicharset.cpp.

{
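LocalFilePointer lfp(file);
TessResultCallback2<char *, char *, int> *fgets_cb =
NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);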
bool success = load_via_fgets(fgets_cb, skip_fragments);
delete fgets_cb;
return success;
}
bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 313 of file unicharset.h.

{ return load_from_file(file, false); }
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 618 of file unicharset.cpp.

{
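InMemoryFilePointer mem_fp(memory, mem_size);
TessResultCallback2<char *, char *, int> *fgets_cb =
NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);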
bool success = load_via_fgets(fgets_cb, skip_fragments);
delete fgets_cb;
return success;
}
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 291 of file unicharset.h.

{
return load_from_inmemory_file(memory, mem_size, false);
}
bool UNICHARSET::major_right_to_left ( ) const

Definition at line 813 of file unicharset.cpp.

{
int ltr_count = 0;
int rtl_count = 0;
for (int id = 0; id < size_used; ++id) {
int dir = get_direction(id);
if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
}
return rtl_count > ltr_count;
}
int UNICHARSET::null_sid ( ) const
inline

Definition at line 752 of file unicharset.h.

{ return null_sid_; }
void UNICHARSET::post_load_setup ( )

Definition at line 750 of file unicharset.cpp.

{
// Number of alpha chars with the case property minus those without,
// in order to determine that half the alpha chars have case.
int net_case_alphas = 0;
int x_height_alphas = 0;
int cap_height_alphas = 0;
top_bottom_set_ = false;
for (UNICHAR_ID id = 0; id < size_used; ++id) {
int min_bottom = 0;
int max_bottom = MAX_UINT8;
int min_top = 0;
int max_top = MAX_UINT8;
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
if (min_top > 0)
top_bottom_set_ = true;
if (get_isalpha(id)) {
if (get_islower(id) || get_isupper(id))
++net_case_alphas;
else
--net_case_alphas;
if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
++x_height_alphas;
else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
++cap_height_alphas;
}
}
script_has_upper_lower_ = net_case_alphas > 0;
script_has_xheight_ = script_has_upper_lower_ ||
(x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
null_sid_ = get_script_id_from_name(null_script);
ASSERT_HOST(null_sid_ == 0);
common_sid_ = get_script_id_from_name("Common");
latin_sid_ = get_script_id_from_name("Latin");
cyrillic_sid_ = get_script_id_from_name("Cyrillic");
greek_sid_ = get_script_id_from_name("Greek");
han_sid_ = get_script_id_from_name("Han");
hiragana_sid_ = get_script_id_from_name("Hiragana");
katakana_sid_ = get_script_id_from_name("Katakana");
// Compute default script. Use the highest-counting alpha script, that is
// not the common script, as that still contains some "alphas".
int* script_counts = new int[script_table_size_used];
memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
for (int id = 0; id < size_used; ++id) {
if (get_isalpha(id)) {
++script_counts[get_script(id)];
}
}
default_sid_ = 0;
for (int s = 1; s < script_table_size_used; ++s) {
if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
default_sid_ = s;
}
delete [] script_counts;
}
void UNICHARSET::reserve ( int  unichars_number)

Definition at line 161 of file unicharset.cpp.

{
if (unichars_number > size_reserved) {
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
for (int i = 0; i < size_used; ++i)
unichars_new[i] = unichars[i];
for (int j = size_used; j < unichars_number; ++j) {
unichars_new[j].properties.script_id = add_script(null_script);
}
delete[] unichars;
unichars = unichars_new;
size_reserved = unichars_number;
}
}
bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 273 of file unicharset.h.

{
FILE* file = fopen(filename, "w+b");
if (file == NULL) return false;
bool result = save_to_file(file);
fclose(file);
return result;
}
bool UNICHARSET::save_to_file ( FILE *  file) const

Definition at line 560 of file unicharset.cpp.

{
fprintf(file, "%d\n", this->size());
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
int min_bottom, max_bottom, min_top, max_top;
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
int min_width, max_width;
get_width_range(id, &min_width, &max_width);
int min_bearing, max_bearing;
get_bearing_range(id, &min_bearing, &max_bearing);
int min_advance, max_advance;
get_advance_range(id, &min_advance, &max_advance);
unsigned int properties = this->get_properties(id);
if (strcmp(this->id_to_unichar(id), " ") == 0) {
fprintf(file, "%s %x %s %d\n", "NULL", properties,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id));
} else {
fprintf(file,
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
this->id_to_unichar(id), properties,
min_bottom, max_bottom, min_top, max_top, min_width, max_width,
min_bearing, max_bearing, min_advance, max_advance,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id), this->get_direction(id),
this->get_mirror(id), this->get_normed_unichar(id),
this->debug_str(id).string());
}
}
return true;
}
bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 763 of file unicharset.h.

{
return script_has_upper_lower_;
}
bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 770 of file unicharset.h.

{
return script_has_xheight_;
}
void UNICHARSET::set_advance_range ( UNICHAR_ID  unichar_id,
int  min_advance,
int  max_advance 
)
inline

Definition at line 541 of file unicharset.h.

{
unichars[unichar_id].properties.min_advance =
static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
unichars[unichar_id].properties.max_advance =
static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
}
void UNICHARSET::set_bearing_range ( UNICHAR_ID  unichar_id,
int  min_bearing,
int  max_bearing 
)
inline

Definition at line 520 of file unicharset.h.

{
  // Stores the bearing limits for unichar_id, clipping each value into
  // [0, MAX_INT16] before narrowing it to inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bearing =
      static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
  props.max_bearing =
      static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
}
void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist 
)

Definition at line 829 of file unicharset.cpp.

{
  // Recomputes the enabled flag of every unichar from the two lists.
  // With no (or an empty) whitelist everything starts enabled; otherwise
  // everything starts disabled and only whitelisted unichars are switched
  // on. The blacklist is applied last, so it overrides the whitelist.
  const bool no_whitelist = (whitelist == NULL) || (whitelist[0] == '\0');
  for (int id = 0; id < size_used; ++id) {
    unichars[id].properties.enabled = no_whitelist;
  }

  if (!no_whitelist) {
    // Walk the whitelist one unichar at a time and enable each known one.
    int pos = 0;
    while (whitelist[pos] != '\0') {
      int len = step(whitelist + pos);
      if (len > 0) {
        UNICHAR_ID u_id = unichar_to_id(whitelist + pos, len);
        if (u_id != INVALID_UNICHAR_ID) {
          unichars[u_id].properties.enabled = true;
        }
      } else {
        // step() found nothing usable here: advance by a single char.
        len = 1;
      }
      pos += len;
    }
  }

  if (blacklist != NULL && blacklist[0] != '\0') {
    // Walk the blacklist the same way and disable each known unichar.
    int pos = 0;
    while (blacklist[pos] != '\0') {
      int len = step(blacklist + pos);
      if (len > 0) {
        UNICHAR_ID u_id = unichar_to_id(blacklist + pos, len);
        if (u_id != INVALID_UNICHAR_ID) {
          unichars[u_id].properties.enabled = false;
        }
      } else {
        // step() found nothing usable here: advance by a single char.
        len = 1;
      }
      pos += len;
    }
  }
}
void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 377 of file unicharset.h.

{
  // Records the direction value in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.direction = value;
}
void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 336 of file unicharset.h.

{
  // Records the isalpha flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isalpha = value;
}
void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 351 of file unicharset.h.

{
  // Records the isdigit flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isdigit = value;
}
void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 341 of file unicharset.h.

{
  // Records the islower flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.islower = value;
}
void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 361 of file unicharset.h.

{
  // Records the isngram flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isngram = value;
}
void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 356 of file unicharset.h.

{
  // Records the ispunctuation flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.ispunctuation = value;
}
void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 346 of file unicharset.h.

{
  // Records the isupper flag in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isupper = value;
}
void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 382 of file unicharset.h.

{
  // Records the id of the mirror unichar in the property record of
  // unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.mirror = mirror;
}
void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 387 of file unicharset.h.

{
  // Records the normalized form in the property record of unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.normed = normed;
}
void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 372 of file unicharset.h.

{
  // Records the id of the other-case unichar in the property record of
  // unichar_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.other_case = other_case;
}
void UNICHARSET::set_ranges_empty ( )

Definition at line 324 of file unicharset.cpp.

{
  // Calls SetRangesEmpty() on the property record of every unichar
  // currently in use.
  int id = 0;
  while (id < size_used) {
    UNICHAR_PROPERTIES& props = unichars[id].properties;
    props.SetRangesEmpty();
    ++id;
  }
}
void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 367 of file unicharset.h.

{
  // Stores in unichar_id's property record the script id that add_script()
  // returns for the script name in value.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.script_id = add_script(value);
}
void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 473 of file unicharset.h.

{
  // Stores the vertical limits for unichar_id, clipping each value into
  // [0, MAX_UINT8] before narrowing it to uinT8.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bottom = static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
  props.max_bottom = static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
  props.min_top = static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
  props.max_top = static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
}
void UNICHARSET::set_width_range ( UNICHAR_ID  unichar_id,
int  min_width,
int  max_width 
)
inline

Definition at line 500 of file unicharset.h.

{
  // Stores the width limits for unichar_id, clipping each value into
  // [0, MAX_INT16] before narrowing it to inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_width = static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
  props.max_width = static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
}
void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src)

Definition at line 333 of file unicharset.cpp.

{
  // For every unichar in this set that src has properties for, copy those
  // properties across. script_id, other_case and mirror are ids relative to
  // src, so each is translated through its string form into an id of this
  // set before the copy.
  for (int id = 0; id < size_used; ++id) {
    UNICHAR_PROPERTIES src_props;
    if (!src.GetStrProperties(id_to_unichar(id), &src_props)) {
      continue;  // src has nothing for this unichar; leave it untouched.
    }

    // Script: register the script name here and use the local id.
    src_props.script_id =
        add_script(src.get_script_from_script_id(src_props.script_id));

    // Other case: fall back to the unichar itself if this set lacks it.
    const char* other_case_str = src.id_to_unichar(src_props.other_case);
    src_props.other_case = contains_unichar(other_case_str)
                               ? unichar_to_id(other_case_str)
                               : id;

    // Mirror: same fallback rule as for other_case.
    const char* mirror_repr = src.id_to_unichar(src_props.mirror);
    src_props.mirror =
        contains_unichar(mirror_repr) ? unichar_to_id(mirror_repr) : id;

    unichars[id].properties.CopyFrom(src_props);
  }
}
int UNICHARSET::size ( ) const
inline

Definition at line 264 of file unicharset.h.

{
  // Plain accessor for the size_used member.
  return this->size_used;
}
int UNICHARSET::step ( const char *  str) const

Definition at line 192 of file unicharset.cpp.

{
// Returns how many chars at the start of str should be consumed as the next
// unichar, or 0 when ids.minmatch() reports no match at all.
// The shortest match is accepted if the text after it is either empty or
// itself starts with a match. Otherwise longer prefixes that ids contains
// are tried, up to UNICHAR_LEN; if none of them leaves a usable remainder,
// the shortest match is returned anyway.
// Find the length of the first matching unicharset member.
int minlength = ids.minmatch(str);
if (minlength == 0)
return 0; // Empty string or illegal char.
int goodlength = minlength;
while (goodlength <= UNICHAR_LEN) {
// Accept this length if nothing follows it or what follows can be matched.
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
return goodlength; // This length works!
// The next char is illegal so find the next usable length.
// Grow the candidate one char at a time until it is a member of ids, the
// string ends, or the length exceeds UNICHAR_LEN.
do {
++goodlength;
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
!ids.contains(str, goodlength));
// The scan may have stopped at the end of the string or at the length
// limit without finding a member, so re-check before looping again.
if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
// This does not constitute a good length!
return minlength;
}
}
// Search to find a subsequent legal char failed so return the minlength.
return minlength;
}
UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 593 of file unicharset.h.

{
  // INVALID_UNICHAR_ID maps to itself. A unichar already flagged islower is
  // returned unchanged; anything else is replaced by its other_case id.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  if (!props.islower) {
    return props.other_case;
  }
  return unichar_id;
}
UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 601 of file unicharset.h.

{
  // INVALID_UNICHAR_ID maps to itself. A unichar already flagged isupper is
  // returned unchanged; anything else is replaced by its other_case id.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  if (!props.isupper) {
    return props.other_case;
  }
  return unichar_id;
}
bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 438 of file unicharset.h.

{
  // Plain accessor for the top_bottom_set_ member.
  return this->top_bottom_set_;
}
void UNICHARSET::unichar_insert ( const char *const  unichar_repr)

Definition at line 511 of file unicharset.cpp.

{
  // Adds unichar_repr to the set under the next free id (size_used).
  // Does nothing if the representation is already present, and only reports
  // to stderr if it is longer than UNICHAR_LEN.
  if (!ids.contains(unichar_repr)) {
    if (strlen(unichar_repr) > UNICHAR_LEN) {
      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
              int(strlen(unichar_repr)), unichar_repr);
      return;
    }
    // Grow the storage when full: 8 slots initially, doubling afterwards.
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }
    // Safe copy: the length was checked against UNICHAR_LEN above.
    strcpy(unichars[size_used].representation, unichar_repr);
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
    // frag is NULL when unichar_repr is not a fragment (see the check below).
    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
    this->unichars[size_used].properties.fragment = frag;
    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
          this->get_script(frag->get_unichar());
    }
    this->unichars[size_used].properties.enabled = true;
    ids.insert(unichar_repr, size_used);
    ++size_used;
  }
}
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 176 of file unicharset.cpp.

{
  // Looks the representation up in ids; anything ids does not contain maps
  // to INVALID_UNICHAR_ID.
  if (!ids.contains(unichar_repr)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr);
}
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 181 of file unicharset.cpp.

{
  // Looks up the first length chars of unichar_repr in ids; anything ids
  // does not contain maps to INVALID_UNICHAR_ID. length must lie in
  // (0, UNICHAR_LEN].
  assert(length > 0 && length <= UNICHAR_LEN);
  if (!ids.contains(unichar_repr, length)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr, length);
}

Member Data Documentation

const char * UNICHARSET::kCustomLigatures
static
Initial value:
{
// Each entry pairs a character sequence with a single escaped code point.
{"ct", "\uE003"},  // c + t -> U+E003
{"ſh", "\uE006"},  // long-s + h -> U+E006
{"ſi", "\uE007"},  // long-s + i -> U+E007
{"ſl", "\uE008"},  // long-s + l -> U+E008
{"ſſ", "\uE009"},  // long-s + long-s -> U+E009
}

Definition at line 132 of file unicharset.h.


The documentation for this class was generated from the following files: