dillo  3.0.5
About: dillo is a small, fast, extensible Web browser particularly suitable for older or smaller computers and embedded systems (but only limited or no support for frames, CSS, JavaScript, Java).
  Fossies Dox: dillo-3.0.5.tar.gz  ("inofficial" and yet experimental doxygen-generated source code documentation)  

hyphenator.cc
Go to the documentation of this file.
1 /*
2  * Dillo Widget
3  *
4  * Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
5  * Johannes Hofmann <Johannes.Hofmann@gmx.de>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program. If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 
22 #include "hyphenator.hh"
23 
24 #include "../lout/misc.hh"
25 #include "../lout/unicode.hh"
26 #include <limits.h>
27 #include <stdio.h>
28 #include <string.h>
29 
30 #define LEN 1000
31 
32 /*
33  * This is (or at least began as) a direct translation of Ned Batchelder's
34  * public-domain Python implementation at
35  * http://nedbatchelder.com/code/modules/hyphenate.py
36  */
37 
38 using namespace lout::object;
39 using namespace lout::container::typed;
40 using namespace lout::misc;
41 using namespace lout::unicode;
42 
43 namespace dw {
44 
45 HashTable <String, Hyphenator> *Hyphenator::hyphenators =
46  new HashTable <String, Hyphenator> (true, true);
47 
48 Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
49 {
50  trie = NULL; // As long we are not sure whether a pattern file can be read.
51 
52  int bufLen = strlen (patFile) + 5 + 1;
53  char *buf = new char[bufLen];
54  snprintf(buf, bufLen, "%s.trie", patFile);
55  FILE *trieF = fopen (buf, "r");
56  delete[] buf;
57 
58  if (trieF) {
59  trie = new Trie ();
60  if (trie->load (trieF) != 0) {
61  delete trie;
62  trie = NULL;
63  }
64  fclose (trieF);
65  }
66 
67  if (trie == NULL) {
68  TrieBuilder trieBuilder(pack);
69  FILE *patF = fopen (patFile, "r");
70  if (patF) {
71 
72  while (!feof (patF)) {
73  char buf[LEN + 1];
74  char *s = fgets (buf, LEN, patF);
75  if (s && s[0] != '%') { // ignore lines starting with '%' as comment
76  // TODO Better exit with an error, when the line is too long.
77  int l = strlen (s);
78  if (s[l - 1] == '\n')
79  s[l - 1] = 0;
80  insertPattern (&trieBuilder, s);
81  }
82  }
83 
84  trie = trieBuilder.createTrie ();
85  fclose (patF);
86  }
87  }
88 
89  exceptions = NULL; // Again, only instantiated when needed.
90 
91  FILE *excF = fopen (excFile, "r");
92  if (excF) {
94  while (!feof (excF)) {
95  char buf[LEN + 1];
96  char *s = fgets (buf, LEN, excF);
97  if (s && s[0] != '%') { // ignore lines starting with '%' as comment
98  // TODO Better exit with an error, when the line is too long.
99  int l = strlen (s);
100  if (s[l - 1] == '\n')
101  s[l - 1] = 0;
102  insertException (s);
103  }
104  }
105  fclose (excF);
106  }
107 }
108 
109 Hyphenator::~Hyphenator ()
110 {
111  delete trie;
112  delete exceptions;
113 }
114 
115 Hyphenator *Hyphenator::getHyphenator (const char *lang)
116 {
117  String *langString = new String (lang);
118 
119  Hyphenator *hyphenator = hyphenators->get (langString);
120  if (hyphenator)
121  delete langString;
122  else {
123  int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
124  char *patFile = new char[patFileLen];
125  snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
126  DILLO_LIBDIR, lang);
127  int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
128  char *excFile = new char[excFileLen];
129  snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
130  DILLO_LIBDIR, lang);
131 
132  //printf ("Loading hyphenation patterns for language '%s' from '%s' and "
133  // "exceptions from '%s' ...\n", lang, patFile, excFile);
134 
135  hyphenator = new Hyphenator (patFile, excFile);
136  hyphenators->put (langString, hyphenator);
137  delete[] patFile;
138  delete[] excFile;
139  }
140 
141  //lout::misc::StringBuffer sb;
142  //hyphenators->intoStringBuffer (&sb);
143  //printf ("%s\n", sb.getChars ());
144 
145  return hyphenator;
146 }
147 
148 void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
149 {
150  // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
151  // and a list of points [ 0, 1, 0, 3, 4 ].
152  int l = strlen (s);
153  char *chars = new char[l + 1];
154  SimpleVector<char> points (1);
155 
156  // TODO numbers consisting of multiple digits?
157  // TODO Encoding: This implementation works exactly like the Python
158  // implementation, based on UTF-8. Does this always work?
159  int numChars = 0;
160  for (int i = 0; s[i]; i++) {
161  if (s[i] >= '0' && s[i] <= '9') {
162  points.setSize(numChars + 1, '0');
163  points.set(numChars, s[i]);
164  } else {
165  chars[numChars++] = s[i];
166  }
167  }
168  chars[numChars] = 0;
169 
170  points.setSize(numChars + 2, '0');
171  points.set(numChars + 1, '\0');
172 
173  // Insert the pattern into the tree. Each character finds a dict
174  // another level down in the tree, and leaf nodes have the list of
175  // points.
176 
177  //printf("insertPattern %s\n", chars);
178 
179  trieBuilder->insert (chars, points.getArray ());
180  delete[] chars;
181 }
182 
183 void Hyphenator::insertException (char *s)
184 {
185  Vector<Integer> *breaks = new Vector<Integer> (1, true);
186 
187  int len = strlen (s);
188  for (int i = 0; i < len - 1; i++)
189  if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
190  breaks->put (new Integer (i - 2 * breaks->size()));
191 
192  char *noHyphens = new char[len - 2 * breaks->size() + 1];
193  int j = 0;
194  for (int i = 0; i < len; ) {
195  if(i < len - 1 &&
196  (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
197  i += 2;
198  else
199  noHyphens[j++] = s[i++];
200  }
201  noHyphens[j] = 0;
202 
203  exceptions->put (new String (noHyphens), breaks);
204  delete[] noHyphens;
205 }
206 
211 bool Hyphenator::isHyphenationCandidate (const char *word)
212 {
213  // Short words aren't hyphenated.
214  return (strlen (word) > 4); // TODO UTF-8?
215 }
216 
225 bool Hyphenator::isCharPartOfActualWord (char *s)
226 {
227 #if 0
228  // Return true when "s" points to a letter.
229  return (s[0] >= 'a' && s[0] <= 'z') ||
230  // UTF-8: starts with 0xc3
231  ((unsigned char)s[0] == 0xc3 &&
232  ((unsigned char)s[1] == 0xa4 /* ä */ ||
233  (unsigned char)s[1] == 0xb6 /* ö */ ||
234  (unsigned char)s[1] == 0xbc /* ü */ ||
235  (unsigned char)s[1] == 0x9f /* ß */ ));
236 #endif
237 
238  return isAlpha (decodeUtf8 (s));
239 }
240 
244 int *Hyphenator::hyphenateWord(core::Platform *platform,
245  const char *word, int *numBreaks)
246 {
247  if ((trie == NULL && exceptions ==NULL) || !isHyphenationCandidate (word)) {
248  *numBreaks = 0;
249  return NULL;
250  }
251 
252  char *wordLc = platform->textToLower (word, strlen (word));
253 
254  int start = 0;
255  SimpleVector <int> breakPos (1);
256 
257  // Split the original word up, ignore anything but characters, and
258  // collect all break points, so that they fit to the original
259  // word. (The latter is what the offset in the call of
260  // hyphenateSingleWord() is for.)
261  while (true) {
262  while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
263  start = platform->nextGlyph (wordLc, start);
264 
265  if (wordLc[start] == 0)
266  break;
267 
268  int end = start, i = end;
269  while (wordLc[i]) {
270  if (!isCharPartOfActualWord (wordLc + i))
271  break;
272  else
273  end = i;
274  i = platform->nextGlyph (wordLc, i);
275  }
276  end = platform->nextGlyph (wordLc, end);
277 
278  int nextStart;
279  if (wordLc[end]) {
280  nextStart = platform->nextGlyph (wordLc, end);
281  wordLc[end] = 0;
282  } else
283  nextStart = end;
284 
285  hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
286  start = nextStart;
287  }
288 
289  free (wordLc);
290 
291  *numBreaks = breakPos.size ();
292  if (*numBreaks == 0)
293  return NULL;
294  else {
295  return breakPos.detachArray ();
296  }
297 }
298 
303 void Hyphenator::hyphenateSingleWord(core::Platform *platform,
304  char *wordLc, int offset,
305  SimpleVector <int> *breakPos)
306 {
307  // If the word is an exception, get the stored points.
308  Vector <Integer> *exceptionalBreaks;
309  ConstString key (wordLc);
310  if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
311  for (int i = 0; i < exceptionalBreaks->size(); i++) {
312  breakPos->increase ();
313  breakPos->set (breakPos->size() - 1,
314  exceptionalBreaks->get(i)->getValue() + offset);
315  }
316  return;
317  }
318 
319 
320  // trie == NULL means that there is no pattern file.
321  if (trie == NULL)
322  return;
323 
324  char *work = new char[strlen (wordLc) + 3];
325  strcpy (work, ".");
326  strcat (work, wordLc);
327  strcat (work, ".");
328 
329  int l = strlen (work);
330  SimpleVector <int> points (l + 1);
331  points.setSize (l + 1, 0);
332 
333  for (int i = 0; i < l; i++) {
334  int state = trie->root;
335 
336  for (int j = i; j < l && trie->validState (state); j++) {
337  const char *p = trie->getData((unsigned char) work[j], &state);
338 
339  if (p) {
340  for (int k = 0; p[k]; k++)
341  points.set(i + k,
342  lout::misc::max (points.get (i + k), p[k] - '0'));
343  }
344  }
345  }
346 
347  delete[] work;
348 
349  // No hyphens in the first two chars or the last two.
350  // Characters are not bytes, so UTF-8 characters must be counted.
351  const char *s = nextUtf8Char (wordLc);
352  if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
353  // First two characters.
354  int bytesStart = s - wordLc;
355  for (int i = 0; i < bytesStart; i++)
356  points.set (i + 1, 0);
357 
358  // Last two characters: instead of iterating back from the end,
359  // we simply iterate from the start to the end and count the
360  // characters.
361 
362  int lenBytes = strlen (wordLc);
363  int lenUtf8 = numUtf8Chars (wordLc);
364  int bytesEnd = 0;
365 
366  s = wordLc;
367  for (int i = 0; s; s = nextUtf8Char (s), i++) {
368  if (i == lenUtf8 - 2)
369  bytesEnd = lenBytes - (s - wordLc);
370  }
371 
372  for (int i = 0; i < bytesEnd; i++)
373  points.set (points.size() - 2 - i, 0);
374  }
375 
376  // Examine the points to build the break point list.
377  int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
378  for (int i = 0; i < n; i++) {
379  if (points.get(i + 2) % 2) {
380  breakPos->increase ();
381  breakPos->set (breakPos->size() - 1, i + 1 + offset);
382  }
383  }
384 }
385 
386 Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
387 
388 TrieBuilder::TrieBuilder (int pack)
389 {
390  this->pack = pack;
391  dataList = new SimpleVector <DataEntry> (10000);
392  stateStack = new SimpleVector <StackEntry> (10);
393  tree = new SimpleVector <Trie::TrieNode> (20000);
394  dataZone = new ZoneAllocator (1024);
395  stateStackPush(0);
396 }
397 
398 TrieBuilder::~TrieBuilder ()
399 {
400  delete dataList;
401  delete stateStack;
402  delete tree;
403  delete dataZone;
404 }
405 
406 void TrieBuilder::insert (const char *key, const char *value)
407 {
408  dataList->increase ();
409  dataList->getLastRef ()->key = (unsigned char *) strdup(key);
410  dataList->getLastRef ()->value = dataZone->strdup (value);
411 }
412 
413 int TrieBuilder::keyCompare (const void *p1, const void *p2)
414 {
415  DataEntry *pd1 = (DataEntry *) p1;
416  DataEntry *pd2 = (DataEntry *) p2;
417 
418  return strcmp ((char *) pd1->key, (char *) pd2->key);
419 }
420 
421 int TrieBuilder::insertState (StackEntry *state, bool root)
422 {
423  int i, j;
424 
425  if (state->count == 0)
426  return 0;
427 
428  if (root) {
429  i = 0; // we reseve slot 0 for the root state
430  } else {
431  /* The bigger pack is the more slots we check and the smaller
432  * the trie will be, but CPU consumption also increases.
433  * Reasonable values for pack seemt to be between 256 and 1024.
434  */
435  i = tree->size () - pack + 2 * state->count;
436 
437  if (i < 256) // reserve first 256 entries for the root state
438  i = 256;
439  }
440 
441  for (;; i++) {
442  if (i + 256 > tree->size ())
443  tree->setSize (i + 256, trieNodeNull);
444 
445  for (j = 1; j < 256; j++) {
446  Trie::TrieNode *tn = tree->getRef(i + j);
447 
448  if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
449  break;
450  }
451 
452  if (j == 256) // found a suitable slot
453  break;
454  }
455 
456  for (int j = 1; j < 256; j++) {
457  Trie::TrieNode *tn = tree->getRef(i + j);
458 
459  if (state->next[j] || state->data[j]) {
460  tn->c = j;
461  tn->next = state->next[j];
462  tn->data = state->data[j];
463  }
464  }
465 
466  assert (root || i >= 256);
467  assert (!root || i == 0);
468  return i;
469 }
470 
471 void TrieBuilder::stateStackPush (unsigned char c)
472 {
473  stateStack->increase ();
474  StackEntry *e = stateStack->getLastRef ();
475  memset (e, 0, sizeof (StackEntry));
476  e->c = c;
477 }
478 
479 int TrieBuilder::stateStackPop ()
480 {
481  int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
482  unsigned char c = stateStack->getLastRef ()->c;
483  const char *data = stateStack->getLastRef ()->data1;
484 
485  stateStack->setSize (stateStack->size () - 1);
486 
487  if (stateStack->size () > 0) {
488  assert (stateStack->getLastRef ()->next[c] == 0);
489  assert (stateStack->getLastRef ()->data[c] == NULL);
490  stateStack->getLastRef ()->next[c] = next;
491  stateStack->getLastRef ()->data[c] = data;
492  stateStack->getLastRef ()->count++;
493  }
494 
495  return next;
496 }
497 
498 Trie *TrieBuilder::createTrie ()
499 {
500  // we need to sort the patterns as byte strings not as unicode
501  qsort (dataList->getArray (), dataList->size (),
502  sizeof (DataEntry), keyCompare);
503 
504  for (int i = 0; i < dataList->size (); i++) {
505  insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
506  free (dataList->getRef (i)->key);
507  }
508 
509  while (stateStack->size ())
510  stateStackPop ();
511 
512  int size = tree->size ();
513  Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
514  dataZone = NULL;
515  return trie;
516 }
517 
518 void TrieBuilder::insertSorted (unsigned char *s, const char *data)
519 {
520  int len = strlen((char*)s);
521 
522  for (int i = 0; i < len; i++) {
523  if (stateStack->size () > i + 1 &&
524  stateStack->getRef (i + 1)->c != s[i]) {
525  for (int j = stateStack->size () - 1; j >= i + 1; j--)
526  stateStackPop();
527  }
528 
529  if (i + 1 >= stateStack->size ())
530  stateStackPush(s[i]);
531  }
532 
533  while (stateStack->size () > len + 1)
534  stateStackPop();
535 
536  assert (stateStack->size () == len + 1);
537  stateStack->getLastRef ()->data1 = data;
538 }
539 
540 Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
541 {
542  this->array = array;
543  this->size = size;
544  this->freeArray = freeArray;
545  this->dataZone = dataZone;
546 }
547 
548 Trie::~Trie ()
549 {
550  delete dataZone;
551  if (freeArray)
552  free(array);
553 }
554 
555 void Trie::save (FILE *file)
556 {
557  for (int i = 0; i < size; i++) {
558  Trie::TrieNode *tn = &array[i];
559 
560  if (tn->data)
561  fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
562  else
563  fprintf(file, "%u, %u\n", tn->c, tn->next);
564  }
565 }
566 
567 int Trie::load (FILE *file)
568 {
569  int next, c, maxNext = 0;
570  SimpleVector <TrieNode> tree (100);
571  dataZone = new ZoneAllocator (1024);
572 
573  while (!feof (file)) {
574  char buf[LEN + 1];
575  char *s = fgets (buf, LEN, file);
576 
577  if (!s)
578  continue;
579 
580  char data[LEN + 1];
581  int n = sscanf (s, "%u, %u, %s", &c, &next, data);
582 
583  if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
584  tree.increase ();
585  tree.getLastRef ()->c = c;
586  tree.getLastRef ()->next = next;
587  if (n >= 3)
588  tree.getLastRef ()->data = dataZone->strdup (data);
589  else
590  tree.getLastRef ()->data = NULL;
591 
592  if (next > maxNext)
593  maxNext = next;
594  } else {
595  goto error;
596  }
597  }
598 
599  if (maxNext >= tree.size ())
600  goto error;
601 
602  size = tree.size ();
603  array = tree.detachArray ();
604  freeArray = true;
605  return 0;
606 
607 error:
608  delete dataZone;
609  dataZone = NULL;
610  return 1;
611 }
612 
613 } // namespace dw
dw::TrieBuilder::StackEntry::data
const char * data[256]
Definition: hyphenator.hh:56
exceptions
static Rule * exceptions
Definition: domain.c:22
lout::object::Integer
An object::Object wrapper for int's.
Definition: object.hh:92
lout::misc::SimpleVector::get
T get(int i) const
Return the one element, explicitly.
Definition: misc.hh:177
lout::object
Here, some common classes (or interfaces) are defined, to standardize the access to other classes.
Definition: object.cc:29
lout::misc::SimpleVector::setSize
void setSize(int newSize)
Set the size explicitly.
Definition: misc.hh:143
dw::Trie::TrieNode::next
uint16_t next
Definition: hyphenator.hh:14
lout::misc::SimpleVector::increase
void increase()
Increase the vector size by one.
Definition: misc.hh:136
lout::misc::ZoneAllocator
A simple allocator optimized to handle many small chunks of memory. The chunks can not be free'd indi...
Definition: misc.hh:549
dw::Trie::TrieNode
Definition: hyphenator.hh:12
lout::object::ConstString
An object::Object wrapper for constant strings (char*).
Definition: object.hh:111
lout::misc
Miscellaneous stuff, which does not fit anywhere else.
Definition: misc.cc:31
key
Definition: colors.c:28
lout::misc::SimpleVector::getArray
T * getArray() const
Definition: misc.hh:121
dw::TrieBuilder::StackEntry::next
int next[256]
Definition: hyphenator.hh:55
lout::misc::SimpleVector::detachArray
T * detachArray()
Definition: misc.hh:123
lout::unicode
Stuff dealing with Unicode characters: UTF-8, character classes etc.
Definition: unicode.cc:8
lout::misc::SimpleVector::getLastRef
T * getLastRef() const
Return the reference of the last element (convenience method).
Definition: misc.hh:201
lout::container::typed::Vector
Typed version of container::untyped::Vector.
Definition: container.hh:399
lout::object::String
An object::Object wrapper for strings (char*).
Definition: object.hh:134
lout::misc::SimpleVector::size
int size() const
Return the number of elements put into this vector.
Definition: misc.hh:119
lout::misc::max
T max(T a, T b)
Definition: misc.hh:20
lout::misc::min
T min(T a, T b)
Definition: misc.hh:19
lout::unicode::numUtf8Chars
int numUtf8Chars(const char *s)
Definition: unicode.cc:142
lout::misc::SimpleVector::set
void set(int i, T t)
Store an object in the vector.
Definition: misc.hh:222
lout::container::typed::HashTable
Typed version of container::untyped::HashTable.
Definition: container.hh:475
lout::container::typed::Vector::get
T * get(int pos) const
Definition: container.hh:412
dw::TrieBuilder
Definition: hyphenator.hh:50
dw::TrieBuilder::DataEntry::key
unsigned char * key
Definition: hyphenator.hh:61
dw::TrieBuilder::StackEntry
Definition: hyphenator.hh:52
dw::TrieBuilder::createTrie
Trie * createTrie()
Definition: hyphenator.cc:498
dw::TrieBuilder::insert
void insert(const char *key, const char *value)
Definition: hyphenator.cc:406
LEN
#define LEN
Definition: hyphenator.cc:30
dw::Trie::TrieNode::c
unsigned char c
Definition: hyphenator.hh:13
dw::core::Platform::nextGlyph
virtual int nextGlyph(const char *text, int idx)=0
Return the index of the next glyph in string text.
dw::Hyphenator
Definition: hyphenator.hh:86
dw::core::Platform::textToLower
virtual char * textToLower(const char *text, int len)=0
Return the string resulting from transforming text to lowercase.
dw::TrieBuilder::StackEntry::count
int count
Definition: hyphenator.hh:54
hyphenator.hh
lout::unicode::nextUtf8Char
const char * nextUtf8Char(const char *s)
Definition: unicode.cc:90
dw::TrieBuilder::DataEntry
Definition: hyphenator.hh:60
lout::object::Integer::getValue
int getValue()
Definition: object.hh:102
lout::container::typed::Vector::put
void put(T *newElement, int newPos=-1)
Definition: container.hh:405
lout::unicode::decodeUtf8
int decodeUtf8(const char *s)
Definition: unicode.cc:53
dw::TrieBuilder::StackEntry::c
unsigned char c
Definition: hyphenator.hh:53
lout::unicode::isAlpha
bool isAlpha(int ch)
Definition: unicode.cc:48
dw
Dw is in this namespace, or sub namespaces of this one.
Definition: alignedtextblock.cc:26
dw::Trie
Definition: hyphenator.hh:10
dw::core::Platform
An interface to encapsulate some platform dependencies.
Definition: platform.hh:16
lout::container::typed::Vector::size
int size() const
Definition: container.hh:414
lout::container::typed
This namespace provides thin wrappers, implemented as C++ templates, to gain type-safety.
Definition: container.hh:345
dw::Trie::TrieNode::data
const char * data
Definition: hyphenator.hh:15
lout::misc::SimpleVector
Simple (simpler than container::untyped::Vector and container::typed::Vector) template based vector.
Definition: misc.hh:71
error
static void error(char *msg)
Definition: dpidc.c:28