24 #include "../lout/misc.hh" 25 #include "../lout/unicode.hh" 48 Hyphenator::Hyphenator (
const char *patFile,
const char *excFile,
int pack)
52 int bufLen = strlen (patFile) + 5 + 1;
53 char *buf =
new char[bufLen];
54 snprintf(buf, bufLen,
"%s.trie", patFile);
55 FILE *trieF = fopen (buf,
"r");
60 if (trie->load (trieF) != 0) {
69 FILE *patF = fopen (patFile,
"r");
72 while (!feof (patF)) {
74 char *s = fgets (buf,
LEN, patF);
75 if (s && s[0] !=
'%') {
80 insertPattern (&trieBuilder, s);
91 FILE *excF = fopen (excFile,
"r");
94 while (!feof (excF)) {
96 char *s = fgets (buf,
LEN, excF);
97 if (s && s[0] !=
'%') {
100 if (s[l - 1] ==
'\n')
109 Hyphenator::~Hyphenator ()
119 Hyphenator *hyphenator = hyphenators->get (langString);
123 int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
124 char *patFile =
new char[patFileLen];
125 snprintf (patFile, patFileLen,
"%s/hyphenation/%s.pat",
127 int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
128 char *excFile =
new char[excFileLen];
129 snprintf (excFile, excFileLen,
"%s/hyphenation/%s.exc",
135 hyphenator =
new Hyphenator (patFile, excFile);
136 hyphenators->put (langString, hyphenator);
148 void Hyphenator::insertPattern (
TrieBuilder *trieBuilder,
char *s)
153 char *chars =
new char[l + 1];
160 for (
int i = 0; s[i]; i++) {
161 if (s[i] >=
'0' && s[i] <=
'9') {
162 points.
setSize(numChars + 1,
'0');
163 points.
set(numChars, s[i]);
165 chars[numChars++] = s[i];
170 points.
setSize(numChars + 2,
'0');
171 points.
set(numChars + 1,
'\0');
183 void Hyphenator::insertException (
char *s)
187 int len = strlen (s);
188 for (
int i = 0; i < len - 1; i++)
189 if((
unsigned char)s[i] == 0xc2 && (
unsigned char)s[i + 1] == 0xad)
192 char *noHyphens =
new char[len - 2 * breaks->
size() + 1];
194 for (
int i = 0; i < len; ) {
196 (
unsigned char)s[i] == 0xc2 && (
unsigned char)s[i + 1] == 0xad)
199 noHyphens[j++] = s[i++];
211 bool Hyphenator::isHyphenationCandidate (
const char *word)
214 return (strlen (word) > 4);
225 bool Hyphenator::isCharPartOfActualWord (
char *s)
229 return (s[0] >=
'a' && s[0] <=
'z') ||
231 ((
unsigned char)s[0] == 0xc3 &&
232 ((
unsigned char)s[1] == 0xa4 ||
233 (
unsigned char)s[1] == 0xb6 ||
234 (
unsigned char)s[1] == 0xbc ||
235 (
unsigned char)s[1] == 0x9f ));
245 const char *word,
int *numBreaks)
247 if ((trie == NULL &&
exceptions ==NULL) || !isHyphenationCandidate (word)) {
252 char *wordLc = platform->
textToLower (word, strlen (word));
262 while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
263 start = platform->
nextGlyph (wordLc, start);
265 if (wordLc[start] == 0)
268 int end = start, i = end;
270 if (!isCharPartOfActualWord (wordLc + i))
280 nextStart = platform->
nextGlyph (wordLc, end);
285 hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
291 *numBreaks = breakPos.
size ();
304 char *wordLc,
int offset,
311 for (
int i = 0; i < exceptionalBreaks->
size(); i++) {
313 breakPos->
set (breakPos->
size() - 1,
324 char *work =
new char[strlen (wordLc) + 3];
326 strcat (work, wordLc);
329 int l = strlen (work);
333 for (
int i = 0; i < l; i++) {
334 int state = trie->root;
336 for (
int j = i; j < l && trie->validState (state); j++) {
337 const char *p = trie->getData((
unsigned char) work[j], &state);
340 for (
int k = 0; p[k]; k++)
354 int bytesStart = s - wordLc;
355 for (
int i = 0; i < bytesStart; i++)
356 points.
set (i + 1, 0);
362 int lenBytes = strlen (wordLc);
368 if (i == lenUtf8 - 2)
369 bytesEnd = lenBytes - (s - wordLc);
372 for (
int i = 0; i < bytesEnd; i++)
373 points.
set (points.
size() - 2 - i, 0);
378 for (
int i = 0; i < n; i++) {
379 if (points.
get(i + 2) % 2) {
381 breakPos->
set (breakPos->
size() - 1, i + 1 + offset);
388 TrieBuilder::TrieBuilder (
int pack)
398 TrieBuilder::~TrieBuilder ()
406 void TrieBuilder::insert (
const char *
key,
const char *value)
408 dataList->increase ();
409 dataList->getLastRef ()->key = (
unsigned char *) strdup(
key);
410 dataList->getLastRef ()->value = dataZone->strdup (value);
413 int TrieBuilder::keyCompare (
const void *p1,
const void *p2)
418 return strcmp ((
char *) pd1->
key, (
char *) pd2->
key);
425 if (state->
count == 0)
435 i = tree->size () - pack + 2 * state->
count;
442 if (i + 256 > tree->size ())
443 tree->setSize (i + 256, trieNodeNull);
445 for (j = 1; j < 256; j++) {
448 if (tn->
c == j || ((state->
next[j] || state->
data[j]) && tn->
c != 0))
456 for (
int j = 1; j < 256; j++) {
459 if (state->
next[j] || state->
data[j]) {
466 assert (root || i >= 256);
467 assert (!root || i == 0);
471 void TrieBuilder::stateStackPush (
unsigned char c)
473 stateStack->increase ();
479 int TrieBuilder::stateStackPop ()
481 int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
482 unsigned char c = stateStack->getLastRef ()->c;
483 const char *data = stateStack->getLastRef ()->data1;
485 stateStack->setSize (stateStack->size () - 1);
487 if (stateStack->size () > 0) {
488 assert (stateStack->getLastRef ()->next[c] == 0);
489 assert (stateStack->getLastRef ()->data[c] == NULL);
490 stateStack->getLastRef ()->next[c] = next;
491 stateStack->getLastRef ()->data[c] = data;
492 stateStack->getLastRef ()->count++;
498 Trie *TrieBuilder::createTrie ()
501 qsort (dataList->getArray (), dataList->size (),
504 for (
int i = 0; i < dataList->size (); i++) {
505 insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
506 free (dataList->getRef (i)->key);
509 while (stateStack->size ())
512 int size = tree->size ();
513 Trie *trie =
new Trie(tree->detachArray(), size,
true, dataZone);
518 void TrieBuilder::insertSorted (
unsigned char *s,
const char *data)
520 int len = strlen((
char*)s);
522 for (
int i = 0; i < len; i++) {
523 if (stateStack->size () > i + 1 &&
524 stateStack->getRef (i + 1)->c != s[i]) {
525 for (
int j = stateStack->size () - 1; j >= i + 1; j--)
529 if (i + 1 >= stateStack->size ())
530 stateStackPush(s[i]);
533 while (stateStack->size () > len + 1)
536 assert (stateStack->size () == len + 1);
537 stateStack->getLastRef ()->data1 = data;
544 this->freeArray = freeArray;
545 this->dataZone = dataZone;
555 void Trie::save (FILE *file)
557 for (
int i = 0; i < size; i++) {
561 fprintf(file,
"%u, %u, %s\n", tn->
c, tn->
next, tn->
data);
563 fprintf(file,
"%u, %u\n", tn->
c, tn->
next);
567 int Trie::load (FILE *file)
569 int next, c, maxNext = 0;
573 while (!feof (file)) {
575 char *s = fgets (buf,
LEN, file);
581 int n = sscanf (s,
"%u, %u, %s", &c, &next, data);
583 if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
588 tree.
getLastRef ()->data = dataZone->strdup (data);
599 if (maxNext >= tree.
size ())