w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and still experimental doxygen-generated source code documentation)

n2builder.cpp
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: n2builder.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
17 *
18 * Builds Normalizer2 data and writes a binary .nrm file.
19 * For the file format see source/common/normalizer2impl.h.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <vector>
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/ucptrie.h"
33 #include "unicode/udata.h"
34 #include "unicode/umutablecptrie.h"
35 #include "unicode/uniset.h"
36 #include "unicode/unistr.h"
37 #include "unicode/usetiter.h"
38 #include "unicode/ustring.h"
39 #include "charstr.h"
40 #include "extradata.h"
41 #include "hash.h"
42 #include "normalizer2impl.h"
43 #include "norms.h"
44 #include "toolutil.h"
45 #include "unewdata.h"
46 #include "uvectr32.h"
47 #include "writesrc.h"
48 
49 #if !UCONFIG_NO_NORMALIZATION
50 
51 /* UDataInfo cf. udata.h */
53  sizeof(UDataInfo),
54  0,
55 
59  0,
60 
61  { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
62  { 4, 0, 0, 0 }, /* formatVersion */
63  { 11, 0, 0, 0 } /* dataVersion (Unicode version) */
64 };
65 
67 
69 public:
70  struct Range {
72  };
73 
75  const Range *nextRange() {
77  return ranges+rangeIndex++;
78  } else {
79  return NULL;
80  }
81  }
82 private:
83  static const Range ranges[4];
85 };
86 
90  // JAMO_T_BASE+1: not U+11A7
93 };
94 
96  norms(errorCode),
97  phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
98  norm16TrieBytes(nullptr), norm16TrieLength(0) {
100  memset(indexes, 0, sizeof(indexes));
101  memset(smallFCD, 0, sizeof(smallFCD));
102 }
103 
105  delete[] norm16TrieBytes;
106 }
107 
108 void
110  UVersionInfo nullVersion={ 0, 0, 0, 0 };
114  0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
115  ) {
118  fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
119  buffer, v);
121  }
123 }
124 
126  if(p!=NULL) {
127  if(p->mappingType!=Norm::NONE) {
129  (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
130  ) {
131  fprintf(stderr,
132  "error in gennorm2 phase %d: "
133  "not permitted to override mapping for U+%04lX from phase %d\n",
134  (int)phase, (long)c, (int)p->mappingPhase);
136  }
137  delete p->mapping;
138  p->mapping=NULL;
139  }
140  p->mappingPhase=phase;
141  }
142  return p;
143 }
144 
146  overrideHandling=oh;
147  ++phase;
148 }
149 
151  norms.createNorm(c)->cc=cc;
152  norms.ccSet.add(c);
153 }
154 
155 static UBool isWellFormed(const UnicodeString &s) {
157  u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode);
159 }
160 
162  if(!isWellFormed(m)) {
163  fprintf(stderr,
164  "error in gennorm2 phase %d: "
165  "illegal one-way mapping from U+%04lX to malformed string\n",
166  (int)phase, (long)c);
168  }
170  p->mapping=new UnicodeString(m);
171  p->mappingType=Norm::ONE_WAY;
172  p->setMappingCP();
173  norms.mappingSet.add(c);
174 }
175 
177  if(U_IS_SURROGATE(c)) {
178  fprintf(stderr,
179  "error in gennorm2 phase %d: "
180  "illegal round-trip mapping from surrogate code point U+%04lX\n",
181  (int)phase, (long)c);
183  }
184  if(!isWellFormed(m)) {
185  fprintf(stderr,
186  "error in gennorm2 phase %d: "
187  "illegal round-trip mapping from U+%04lX to malformed string\n",
188  (int)phase, (long)c);
190  }
191  int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
192  if(numCP!=2) {
193  fprintf(stderr,
194  "error in gennorm2 phase %d: "
195  "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
196  (int)phase, (long)c, (int)numCP);
198  }
200  p->mapping=new UnicodeString(m);
201  p->mappingType=Norm::ROUND_TRIP;
202  p->mappingCP=U_SENTINEL;
203  norms.mappingSet.add(c);
204 }
205 
207  // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
209  p->mappingType=Norm::REMOVED;
210  norms.mappingSet.add(c);
211 }
212 
214  Norm::MappingType mappingType) const {
215  if(buffer.isEmpty()) {
216  return FALSE; // Maps-to-empty-string is no boundary of any kind.
217  }
218  int32_t lastStarterIndex=buffer.lastStarterIndex();
219  if(lastStarterIndex<0) {
220  return FALSE; // no starter
221  }
222  const int32_t lastIndex=buffer.length()-1;
223  if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
224  // One-way mapping where after the last starter is at least one combining mark
225  // with a combining class greater than 1,
226  // which means that another combining mark can reorder before it.
227  // By contrast, in a round-trip mapping this does not prevent a boundary as long as
228  // the starter or composite does not combine-forward with a following combining mark.
229  return FALSE;
230  }
231  UChar32 starter=buffer.charAt(lastStarterIndex);
232  if(lastStarterIndex==0 && norms.combinesBack(starter)) {
233  // The last starter is at the beginning of the mapping and combines backward.
234  return FALSE;
235  }
236  if(Hangul::isJamoL(starter) ||
237  (Hangul::isJamoV(starter) &&
238  0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
239  // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
240  // otherwise it is blocked.
241  return lastStarterIndex!=lastIndex;
242  }
243  // Note: There can be no Hangul syllable in the fully decomposed mapping.
244 
245  // Multiple starters can combine into one.
246  // Look for the first of the last sequence of starters, excluding Jamos.
247  int32_t i=lastStarterIndex;
248  UChar32 c;
249  while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
250  starter=c;
251  --i;
252  }
253  // Compose as far as possible, and see if further compositions with
254  // characters following this mapping are possible.
255  const Norm *starterNorm=norms.getNorm(starter);
256  if(i==lastStarterIndex &&
257  (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
258  return TRUE; // The last starter does not combine forward.
259  }
260  uint8_t prevCC=0;
261  while(++i<buffer.length()) {
262  uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
263  if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
264  // The starter combines with a mark that reorders before the current one.
265  return FALSE;
266  }
267  UChar32 c=buffer.charAt(i);
268  if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
269  norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
270  // The starter combines with c into a composite replacement starter.
271  starterNorm=norms.getNorm(starter);
272  if(i>=lastStarterIndex &&
273  (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
274  return TRUE; // The composite does not combine further.
275  }
276  // Keep prevCC because we "removed" the combining mark.
277  } else if(cc==0) {
278  starterNorm=norms.getNorm(c);
279  if(i==lastStarterIndex &&
280  (starterNorm==nullptr || starterNorm->compositions==nullptr)) {
281  return TRUE; // The new starter does not combine forward.
282  }
283  prevCC=0;
284  } else {
285  prevCC=cc;
286  }
287  }
288  if(prevCC==0) {
289  return FALSE; // forward-combining starter at the very end
290  }
291  if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
292  // The starter combines with another mark.
293  return FALSE;
294  }
295  return TRUE;
296 }
297 
299  if(buffer.lastStarterIndex()<0) {
300  return FALSE; // no starter
301  }
302  const Norm *starterNorm=nullptr;
303  uint8_t prevCC=0;
304  for(int32_t i=0; i<buffer.length(); ++i) {
305  UChar32 c=buffer.charAt(i);
306  uint8_t cc=buffer.ccAt(i);
307  if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
308  norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
309  return TRUE; // normal composite
310  } else if(cc==0) {
311  if(Hangul::isJamoL(c)) {
312  if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
313  return TRUE; // Hangul syllable
314  }
315  starterNorm=nullptr;
316  } else {
317  starterNorm=norms.getNorm(c);
318  }
319  }
320  prevCC=cc;
321  }
322  return FALSE;
323 }
324 
326  // Prerequisites: Compositions are built, mappings are recursively decomposed.
327  // Mappings are not yet in canonical order.
328  //
329  // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
330  // Therefore, we cannot compute algorithmic mapping deltas here.
331  // Error conditions are checked, but printed later when we do know the offending code point.
332  if(norm.hasMapping()) {
333  if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
334  norm.error="mapping longer than maximum of 31";
335  return;
336  }
337  // Ensure canonical order.
339  if(norm.rawMapping!=nullptr) {
340  norms.reorder(*norm.rawMapping, buffer);
341  buffer.reset();
342  }
343  norms.reorder(*norm.mapping, buffer);
344  if(buffer.isEmpty()) {
345  // A character that is deleted (maps to an empty string) must
346  // get the worst-case lccc and tccc values because arbitrary
347  // characters on both sides will become adjacent.
348  norm.leadCC=1;
349  norm.trailCC=0xff;
350  } else {
351  norm.leadCC=buffer.ccAt(0);
352  norm.trailCC=buffer.ccAt(buffer.length()-1);
353  }
354 
355  norm.hasCompBoundaryBefore=
356  !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
357  norm.hasCompBoundaryAfter=
358  norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
359 
360  if(norm.combinesBack) {
361  norm.error="combines-back and decomposes, not possible in Unicode normalization";
362  } else if(norm.mappingType==Norm::ROUND_TRIP) {
363  if(norm.compositions!=NULL) {
365  } else {
367  }
368  } else { // one-way mapping
369  if(norm.compositions!=NULL) {
370  norm.error="combines-forward and has a one-way mapping, "
371  "not possible in Unicode normalization";
372  } else if(buffer.isEmpty()) {
373  norm.type=Norm::NO_NO_EMPTY;
374  } else if(!norm.hasCompBoundaryBefore) {
376  } else if(mappingRecomposes(buffer)) {
378  } else {
379  // The mapping is comp-normalized.
381  }
382  }
383  } else { // no mapping
384  norm.leadCC=norm.trailCC=norm.cc;
385 
386  norm.hasCompBoundaryBefore=
387  norm.cc==0 && !norm.combinesBack;
388  norm.hasCompBoundaryAfter=
389  norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
390 
391  if(norm.combinesBack) {
392  if(norm.compositions!=nullptr) {
393  // Earlier code checked ccc=0.
395  } else {
396  norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc
397  }
398  } else if(norm.compositions!=nullptr) {
399  // Earlier code checked ccc=0.
401  } else if(norm.cc!=0) {
403  } else {
404  norm.type=Norm::INERT;
405  }
406  }
407 }
408 
410 public:
412  Norms::Enumerator(n), builder(b), norm16Trie(trie) {}
415  }
418 };
419 
421  UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
422  smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
423 }
424 
426  if((norm.leadCC|norm.trailCC)!=0) {
427  for(UChar32 c=start; c<=end; ++c) {
428  setSmallFCD(c);
429  }
430  }
431 
432  int32_t norm16;
433  switch(norm.type) {
434  case Norm::INERT:
435  norm16=Normalizer2Impl::INERT;
436  break;
438  norm16=norm.offset*2;
439  break;
442  break;
445  break;
447  norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
448  break;
451  break;
454  break;
455  case Norm::NO_NO_EMPTY:
457  break;
458  case Norm::NO_NO_DELTA:
459  {
460  // Positive offset from minNoNoDelta, shifted left for additional bits.
462  if(norm.trailCC==0) {
463  // DELTA_TCCC_0==0
464  } else if(norm.trailCC==1) {
466  } else {
468  }
469  norm16=getMinNoNoDelta()+offset;
470  break;
471  }
474  break;
476  norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
477  break;
479  U_ASSERT(norm.cc!=0);
480  norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
481  break;
482  default: // Should not occur.
484  }
485  U_ASSERT((norm16&1)==0);
486  if(norm.hasCompBoundaryAfter) {
488  }
489  IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
490  umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode);
491 
492  // Set the minimum code points for real data lookups in the quick check loops.
493  UBool isDecompNo=
495  norm.cc!=0;
498  }
499  UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
502  }
505  }
506 }
507 
509  HangulIterator hi;
511  // Check that none of the Hangul/Jamo code points have data.
512  while((range=hi.nextRange())!=NULL) {
513  for(UChar32 c=range->start; c<=range->end; ++c) {
514  if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) {
515  fprintf(stderr,
516  "gennorm2 error: "
517  "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
518  (long)c);
520  }
521  }
522  }
523  // Set data for algorithmic runtime handling.
524  IcuToolErrorCode errorCode("gennorm2/setHangulData()");
525 
526  // Jamo V/T are maybeYes
529  }
534  // JAMO_T_BASE+1: not U+11A7
537 
538  // Hangul LV encoded as minYesNo
540  // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
545  }
546  // Set the first LV, then write all other Hangul syllables as LVT,
547  // then overwrite the remaining LV.
552  umutablecptrie_set(norm16Trie, c, lv, errorCode);
553  }
554  errorCode.assertSuccess();
555 }
556 
557 LocalUCPTriePointer Normalizer2DataBuilder::processData() {
558  // Build composition lists before recursive decomposition,
559  // so that we still have the raw, pair-wise mappings.
560  CompositionBuilder compBuilder(norms);
561  norms.enumRanges(compBuilder);
562 
563  // Recursively decompose all mappings.
564  Decomposer decomposer(norms);
565  do {
566  decomposer.didDecompose=FALSE;
567  norms.enumRanges(decomposer);
568  } while(decomposer.didDecompose);
569 
570  // Set the Norm::Type and other properties.
571  int32_t normsLength=norms.length();
572  for(int32_t i=1; i<normsLength; ++i) {
574  }
575 
576  // Write the properties, mappings and composition lists to
577  // appropriate parts of the "extra data" array.
579  norms.enumRanges(extra);
580 
585  extraData.append(extra.yesNoMappingsOnly);
587  extraData.append(extra.noNoMappingsCompYes);
591  extraData.append(extra.noNoMappingsCompNoMaybeCC);
593  extraData.append(extra.noNoMappingsEmpty);
595 
596  // Pad the maybeYesCompositions length to a multiple of 4,
597  // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
598  while(extra.maybeYesCompositions.length()&3) {
599  extra.maybeYesCompositions.append((UChar)0);
600  }
601  extraData.insert(0, extra.maybeYesCompositions);
604  extra.maybeYesCompositions.length()*2;
605 
606  // Pad to even length for 4-byte alignment of following data.
607  if(extraData.length()&1) {
608  extraData.append((UChar)0);
609  }
610 
611  int32_t minNoNoDelta=getMinNoNoDelta();
612  U_ASSERT((minNoNoDelta&7)==0);
613  if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
614  fprintf(stderr,
615  "gennorm2 error: "
616  "data structure overflow, too much mapping composition data\n");
618  }
619 
620  // writeNorm16() and setHangulData() reduce these as needed.
624 
625  IcuToolErrorCode errorCode("gennorm2/processData()");
626  UMutableCPTrie *norm16Trie = umutablecptrie_open(
628  errorCode.assertSuccess();
629 
630  // Map each code point to its norm16 value,
631  // including the properties that fit directly,
632  // and the offset to the "extra data" if necessary.
633  Norm16Writer norm16Writer(norm16Trie, norms, *this);
634  norms.enumRanges(norm16Writer);
635  // TODO: iterate via getRange() instead of callback?
636 
637  setHangulData(norm16Trie);
638 
639  // Look for the "worst" norm16 value of any supplementary code point
640  // corresponding to a lead surrogate, and set it as that surrogate's value.
641  // Enables UTF-16 quick check inner loops to look at only code units.
642  //
643  // We could be more sophisticated:
644  // We could collect a bit set for whether there are values in the different
645  // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
646  // and select the best value that only breaks the composition and/or decomposition
647  // inner loops if necessary.
648  // However, that seems like overkill for an optimization for supplementary characters.
649  //
650  // First check that surrogate code *points* are inert.
651  // The parser should have rejected values/mappings for them.
652  uint32_t value;
653  UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
654  nullptr, nullptr, &value);
655  if (value != Normalizer2Impl::INERT || end < 0xdfff) {
656  fprintf(stderr,
657  "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n",
658  (int)end, (long)value);
660  }
661  uint32_t maxNorm16 = 0;
662  // ANDing values yields 0 bits where any value has a 0.
663  // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
664  uint32_t andedNorm16 = 0;
665  end = 0;
666  for (UChar32 start = 0x10000;;) {
667  if (start > end) {
669  nullptr, nullptr, &value);
670  if (end < 0) { break; }
671  }
672  if ((start & 0x3ff) == 0) {
673  // Data for a new lead surrogate.
674  maxNorm16 = andedNorm16 = value;
675  } else {
676  if (value > maxNorm16) {
677  maxNorm16 = value;
678  }
679  andedNorm16 &= value;
680  }
681  // Intersect each range with the code points for one lead surrogate.
682  UChar32 leadEnd = start | 0x3ff;
683  if (leadEnd <= end) {
684  // End of the supplementary block for a lead surrogate.
685  if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) {
686  // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
687  // Otherwise it might end up at something like JAMO_VT which stays in
688  // the inner decomposition quick check loop.
690  }
691  maxNorm16 =
692  (maxNorm16 & ~~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
694  if (maxNorm16 != Normalizer2Impl::INERT) {
695  umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode);
696  }
697  if (value == Normalizer2Impl::INERT) {
698  // Potentially skip inert supplementary blocks for several lead surrogates.
699  start = (end + 1) & ~0x3ff;
700  } else {
701  start = leadEnd + 1;
702  }
703  } else {
704  start = end + 1;
705  }
706  }
707 
708  // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
709  // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
710  // which is harmless.
711  // As a result, the minimum code points are always BMP code points.
713  if(minCP>=0x10000) {
715  }
717  if(minCP>=0x10000) {
719  }
721  if(minCP>=0x10000) {
723  }
724 
725  LocalUCPTriePointer builtTrie(
727  norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode);
729  fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n",
730  errorCode.errorName());
731  exit(errorCode.reset());
732  }
733  umutablecptrie_close(norm16Trie);
734  errorCode.reset();
737  errorCode.assertSuccess();
738 
739  int32_t offset=(int32_t)sizeof(indexes);
743  offset+=extraData.length()*2;
745  offset+=sizeof(smallFCD);
746  int32_t totalSize=offset;
748  indexes[i]=totalSize;
749  }
750 
751  if(beVerbose) {
752  printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength);
753  printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length());
754  printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD));
755  printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
756  printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
757  printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
758  printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
759  printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
760  printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
761  printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
762  printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
763  printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
764  printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
765  printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
766  printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
767  printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
768  }
769 
770  UVersionInfo nullVersion={ 0, 0, 0, 0 };
771  if(0==memcmp(nullVersion, unicodeVersion, 4)) {
773  }
775  return builtTrie;
776 }
777 
779  processData();
780 
781  IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
782  UNewDataMemory *pData=
785  if(errorCode.isFailure()) {
786  fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
787  filename, errorCode.errorName());
788  exit(errorCode.reset());
789  }
790  udata_writeBlock(pData, indexes, sizeof(indexes));
792  udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length());
793  udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
794  int32_t writtenSize=udata_finish(pData, errorCode);
795  if(errorCode.isFailure()) {
796  fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
797  exit(errorCode.reset());
798  }
800  if(writtenSize!=totalSize) {
801  fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
802  (long)writtenSize, (long)totalSize);
804  }
805 }
806 
807 void
809  LocalUCPTriePointer norm16Trie = processData();
810 
811  IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
812  const char *basename=findBasename(filename);
814  CharString dataName(basename, errorCode);
815  const char *extension=strrchr(basename, '.');
816  if(extension!=NULL) {
817  dataName.truncate((int32_t)(extension-basename));
818  }
819  const char *name=dataName.data();
820  errorCode.assertSuccess();
821 
822  FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp");
823  if(f==NULL) {
824  fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
825  filename);
827  }
828  fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
829 
830  char line[100];
831  sprintf(line, "static const UVersionInfo %s_formatVersion={", name);
832  usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
833  sprintf(line, "static const UVersionInfo %s_dataVersion={", name);
834  usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
835  sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name);
837 
838  usrc_writeUCPTrie(f, name, norm16Trie.getAlias());
839 
840  sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", name);
841  usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "\n};\n\n");
842  sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", name);
843  usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n");
844 
845  fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
846  fclose(f);
847 }
848 
849 namespace {
850 
851 bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
852  if(s1 == nullptr) {
853  return s2 == nullptr;
854  } else if(s2 == nullptr) {
855  return false;
856  } else {
857  return *s1 == *s2;
858  }
859 }
860 
// Data file syntax characters, looked up as typeChars[mappingType]
// when writeDataFile() prints a mapping line.
// NOTE(review): presumably ordered like the Norm::MappingType enum
// (NONE, REMOVED, ROUND_TRIP, ONE_WAY) -- confirm against norms.h.
// Declared as an immutable array so that the table cannot be re-pointed.
constexpr char typeChars[] = "?-=>";
862 
863 void writeMapping(FILE *f, const UnicodeString *m) {
864  if(m != nullptr && !m->isEmpty()) {
865  int32_t i = 0;
866  UChar32 c = m->char32At(i);
867  fprintf(f, "%04lX", (long)c);
868  while((i += U16_LENGTH(c)) < m->length()) {
869  c = m->char32At(i);
870  fprintf(f, " %04lX", (long)c);
871  }
872  }
873  fputs("\n", f);
874 }
875 
876 } // namespace
877 
878 void
879 Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
880  // Do not processData() before writing the input-syntax data file.
881  FILE *f = fopen(filename, "w");
882  if(f == nullptr) {
883  fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
884  filename);
886  return;
887  }
888 
889  if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
890  unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
893  fprintf(f, "* Unicode %s\n\n", uv);
894  }
895 
896  UnicodeSetIterator ccIter(norms.ccSet);
899  uint8_t prevCC = 0;
900  bool done = false;
901  bool didWrite = false;
902  do {
903  UChar32 c;
904  uint8_t cc;
905  if(ccIter.next() && !ccIter.isString()) {
906  c = ccIter.getCodepoint();
907  cc = norms.getCC(c);
908  } else {
909  c = 0x110000;
910  cc = 0;
911  done = true;
912  }
913  if(cc == prevCC && c == (end + 1)) {
914  end = c;
915  } else {
916  if(prevCC != 0) {
917  if(start == end) {
918  fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
919  } else {
920  fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
921  }
922  didWrite = true;
923  }
924  start = end = c;
925  prevCC = cc;
926  }
927  } while(!done);
928  if(didWrite) {
929  fputs("\n", f);
930  }
931 
932  UnicodeSetIterator mIter(norms.mappingSet);
933  start = U_SENTINEL;
934  end = U_SENTINEL;
935  const UnicodeString *prevMapping = nullptr;
936  Norm::MappingType prevType = Norm::NONE;
937  done = false;
938  do {
939  UChar32 c;
940  const Norm *norm;
941  if(mIter.next() && !mIter.isString()) {
942  c = mIter.getCodepoint();
943  norm = norms.getNorm(c);
944  } else {
945  c = 0x110000;
946  norm = nullptr;
947  done = true;
948  }
949  const UnicodeString *mapping;
951  if(norm == nullptr) {
952  mapping = nullptr;
953  type = Norm::NONE;
954  } else {
955  type = norm->mappingType;
956  if(type == Norm::NONE) {
957  mapping = nullptr;
958  } else {
959  mapping = norm->mapping;
960  }
961  }
962  if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
963  end = c;
964  } else {
965  if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
966  if(start == end) {
967  fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
968  } else {
969  fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
970  }
971  writeMapping(f, prevMapping);
972  }
973  start = end = c;
974  prevMapping = mapping;
975  prevType = type;
976  }
977  } while(!done);
978 
979  fclose(f);
980 }
981 
982 void
984  const Normalizer2DataBuilder &b2,
986  // Compute diff = b1 - b2
987  // so that we should be able to get b1 = b2 + diff.
988  if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
989  memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
990  }
991 
992  UnicodeSet ccSet(b1.norms.ccSet);
993  ccSet.addAll(b2.norms.ccSet);
994  UnicodeSetIterator ccIter(ccSet);
995  while(ccIter.next() && !ccIter.isString()) {
996  UChar32 c = ccIter.getCodepoint();
997  uint8_t cc1 = b1.norms.getCC(c);
998  uint8_t cc2 = b2.norms.getCC(c);
999  if(cc1 != cc2) {
1000  diff.setCC(c, cc1);
1001  }
1002  }
1003 
1004  UnicodeSet mSet(b1.norms.mappingSet);
1005  mSet.addAll(b2.norms.mappingSet);
1006  UnicodeSetIterator mIter(mSet);
1007  while(mIter.next() && !mIter.isString()) {
1008  UChar32 c = mIter.getCodepoint();
1009  const Norm *norm1 = b1.norms.getNorm(c);
1010  const Norm *norm2 = b2.norms.getNorm(c);
1011  const UnicodeString *mapping1;
1012  Norm::MappingType type1;
1013  if(norm1 == nullptr || !norm1->hasMapping()) {
1014  mapping1 = nullptr;
1015  type1 = Norm::NONE;
1016  } else {
1017  mapping1 = norm1->mapping;
1018  type1 = norm1->mappingType;
1019  }
1020  const UnicodeString *mapping2;
1021  Norm::MappingType type2;
1022  if(norm2 == nullptr || !norm2->hasMapping()) {
1023  mapping2 = nullptr;
1024  type2 = Norm::NONE;
1025  } else {
1026  mapping2 = norm2->mapping;
1027  type2 = norm2->mappingType;
1028  }
1029  if(type1 == type2 && equalStrings(mapping1, mapping2)) {
1030  // Nothing to do.
1031  } else if(type1 == Norm::NONE) {
1032  diff.removeMapping(c);
1033  } else if(type1 == Norm::ROUND_TRIP) {
1034  diff.setRoundTripMapping(c, *mapping1);
1035  } else if(type1 == Norm::ONE_WAY) {
1036  diff.setOneWayMapping(c, *mapping1);
1037  }
1038  }
1039 }
1040 
1042 
1043 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1044 
1045 /*
1046  * Hey, Emacs, please set the following:
1047  *
1048  * Local Variables:
1049  * indent-tabs-mode: nil
1050  * End:
1051  */
#define type(a)
Definition: aptex-macros.h:171
#define name
const char * data() const
Definition: charstr.h:83
CharString & truncate(int32_t newLength)
Definition: charstr.cpp:92
UBool didDecompose
Definition: norms.h:208
UnicodeString yesYesCompositions
Definition: extradata.h:36
UnicodeString noNoMappingsCompYes
Definition: extradata.h:39
UnicodeString noNoMappingsCompBoundaryBefore
Definition: extradata.h:40
UnicodeString yesNoMappingsOnly
Definition: extradata.h:38
UnicodeString noNoMappingsEmpty
Definition: extradata.h:42
UnicodeString noNoMappingsCompNoMaybeCC
Definition: extradata.h:41
UnicodeString maybeYesCompositions
Definition: extradata.h:35
UnicodeString yesNoMappingsAndCompositions
Definition: extradata.h:37
static const Range ranges[4]
Definition: n2builder.cpp:83
const Range * nextRange()
Definition: n2builder.cpp:75
int32_t rangeIndex
Definition: n2builder.cpp:84
static UBool isJamoV(UChar32 c)
static UBool isJamo(UChar32 c)
static UBool isJamoL(UChar32 c)
UMutableCPTrie * norm16Trie
Definition: n2builder.cpp:417
Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b)
Definition: n2builder.cpp:411
Normalizer2DataBuilder & builder
Definition: n2builder.cpp:416
void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override
Definition: n2builder.cpp:413
void setRoundTripMapping(UChar32 c, const UnicodeString &m)
Definition: n2builder.cpp:176
int32_t indexes[Normalizer2Impl::IX_COUNT]
Definition: n2builder.h:109
void setOneWayMapping(UChar32 c, const UnicodeString &m)
Definition: n2builder.cpp:161
UnicodeString extraData
Definition: n2builder.h:112
void setHangulData(UMutableCPTrie *norm16Trie)
Definition: n2builder.cpp:508
void setUnicodeVersion(const char *v)
Definition: n2builder.cpp:109
int32_t getMinNoNoDelta() const
Definition: n2builder.h:94
void postProcess(Norm &norm)
Definition: n2builder.cpp:325
void removeMapping(UChar32 c)
Definition: n2builder.cpp:206
void writeCSourceFile(const char *filename)
Definition: n2builder.cpp:808
void writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm)
Definition: n2builder.cpp:425
void setCC(UChar32 c, uint8_t cc)
Definition: n2builder.cpp:150
void setOverrideHandling(OverrideHandling oh)
Definition: n2builder.cpp:145
UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const
Definition: n2builder.cpp:298
void setSmallFCD(UChar32 c)
Definition: n2builder.cpp:420
Normalizer2DataBuilder(UErrorCode &errorCode)
Definition: n2builder.cpp:95
LocalUCPTriePointer processData()
Definition: n2builder.cpp:557
static void computeDiff(const Normalizer2DataBuilder &b1, const Normalizer2DataBuilder &b2, Normalizer2DataBuilder &diff)
Definition: n2builder.cpp:983
Norm * checkNormForMapping(Norm *p, UChar32 c)
Definition: n2builder.cpp:125
UVersionInfo unicodeVersion
Definition: n2builder.h:115
Optimization optimization
Definition: n2builder.h:107
uint8_t smallFCD[0x100]
Definition: n2builder.h:113
UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, Norm::MappingType mappingType) const
Definition: n2builder.cpp:213
uint8_t * norm16TrieBytes
Definition: n2builder.h:110
void writeDataFile(const char *filename, Bool__ writeRemoved) const
Definition: n2builder.cpp:879
OverrideHandling overrideHandling
Definition: n2builder.h:105
void writeBinaryFile(const char *filename)
Definition: n2builder.cpp:778
Definition: norms.h:146
Norm * createNorm(UChar32 c)
Definition: norms.cpp:114
int32_t length() const
Definition: norms.h:151
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const
Definition: norms.cpp:127
uint8_t getCC(UChar32 c) const
Definition: norms.h:163
const Norm & getNormRefByIndex(int32_t i) const
Definition: norms.h:152
UnicodeSet ccSet
Definition: norms.h:185
UnicodeSet mappingSet
Definition: norms.h:185
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const
Definition: norms.cpp:142
UBool combinesBack(UChar32 c) const
Definition: norms.h:164
const Norm & getNormRef(UChar32 c) const
Definition: norms.cpp:110
Norm * getNorm(UChar32 c)
Definition: norms.cpp:94
void enumRanges(Enumerator &e)
Definition: norms.cpp:156
#define UPRV_LENGTHOF(array)
Definition: cmemory.h:50
#define n
Definition: t4ht.c:1290
#define b
Definition: jpegint.h:372
@ FALSE
Definition: dd.h:101
@ TRUE
Definition: dd.h:102
#define strrchr
Definition: detex.c:67
int v
Definition: dviconv.c:10
#define fopen
Definition: xxstdio.h:21
int printf()
C++ API: ErrorCode class intended to make it easier to use ICU C and C++ APIs from C++ user code.
mpz_t * f
Definition: gen-fib.c:34
#define s
Definition: afcover.h:80
U_NAMESPACE_BEGIN UBool beVerbose
Definition: gennorm2.cpp:47
U_NAMESPACE_BEGIN UBool haveCopyright
Definition: gennorm2.cpp:47
#define c(n)
Definition: gpos-common.c:150
#define memcmp(s1, s2, n)
Definition: gsftopk.c:66
#define memcpy(d, s, n)
Definition: gsftopk.c:64
unsigned char UChar
Definition: bzip2.c:163
#define NULL
Definition: ftobjs.h:61
small capitals from c petite p
Definition: afcover.h:72
small capitals from c petite p scientific i
Definition: afcover.h:80
void exit()
unsigned int uint32_t
Definition: stdint.h:80
signed int int32_t
Definition: stdint.h:77
unsigned char uint8_t
Definition: stdint.h:78
static int phase
Definition: cs_type2.c:84
#define basename
Definition: dvi.c:33
#define fclose
Definition: debug.h:100
#define fputs
Definition: mendex.h:67
#define fprintf
Definition: mendex.h:64
#define b1
Definition: texmfmem.h:169
#define b2
Definition: texmfmem.h:170
#define sprintf
Definition: snprintf.c:44
#define U_IS_BIG_ENDIAN
Definition: platform.h:403
#define U_CHARSET_FAMILY
Definition: platform.h:632
#define extension(a0)
Definition: tif_fax3.c:201
C++ API: "Smart pointers" for use with and in ICU4C C++ code.
static UBool isWellFormed(const UnicodeString &s)
Definition: n2builder.cpp:155
static UDataInfo dataInfo
Definition: n2builder.cpp:52
#define version
Definition: nup.c:10
union value value
Definition: obx.h:44
char * filename[256]
Definition: pbmtopk.c:46
static int offset
Definition: ppmtogif.c:642
bstring c int memset(void *s, int c, int length)
C API: Unicode string handling functions.
C API: Platform Utilities.
#define uint32_t
Definition: stdint.in.h:168
#define int32_t
Definition: stdint.in.h:167
#define uint8_t
Definition: stdint.in.h:154
Definition: norms.h:58
UChar32 combine(UChar32 trail) const
Definition: norms.cpp:55
UnicodeString * mapping
Definition: norms.h:84
MappingType mappingType
Definition: norms.h:88
@ YES_NO_COMBINES_FWD
Definition: norms.h:113
@ NO_NO_COMP_YES
Definition: norms.h:117
@ MAYBE_YES_COMBINES_FWD
Definition: norms.h:130
@ YES_NO_MAPPING_ONLY
Definition: norms.h:115
@ NO_NO_COMP_BOUNDARY_BEFORE
Definition: norms.h:119
@ NO_NO_EMPTY
Definition: norms.h:123
@ YES_YES_WITH_CC
Definition: norms.h:134
@ NO_NO_COMP_NO_MAYBE_CC
Definition: norms.h:121
@ MAYBE_YES_SIMPLE
Definition: norms.h:132
@ YES_YES_COMBINES_FWD
Definition: norms.h:111
@ INERT
Definition: norms.h:109
@ NO_NO_DELTA
Definition: norms.h:125
MappingType
Definition: norms.h:59
@ NONE
Definition: norms.h:59
@ REMOVED
Definition: norms.h:59
@ ROUND_TRIP
Definition: norms.h:59
@ ONE_WAY
Definition: norms.h:59
UBool hasMapping() const
Definition: norms.h:61
UBool combinesBack
Definition: norms.h:92
uint8_t cc
Definition: norms.h:91
UVector32 * compositions
Definition: norms.h:90
uint8_t formatVersion[4]
Definition: udata.h:148
uint8_t dataVersion[4]
Definition: udata.h:152
Definition: utils.c:300
size_t length
Definition: utils.c:302
Definition: bdf.c:133
Definition: tpic.c:45
pointer path
Definition: t1imager.h:36
#define FILE
Definition: t1stdio.h:34
s1
Definition: t4ht.c:1059
char * s2
Definition: t4ht.c:1062
int diff
Definition: tex4ht.c:3815
m
Definition: tex4ht.c:3990
int mapping
Definition: ttf2pfb.c:116
#define U_ASSERT(exp)
Definition: uassert.h:37
#define U_UNICODE_VERSION
Definition: uchar.h:63
@ UCPMAP_RANGE_NORMAL
Definition: ucpmap.h:48
@ UCPTRIE_TYPE_FAST
Definition: ucptrie.h:131
@ UCPTRIE_VALUE_BITS_16
Definition: ucptrie.h:162
static const char * findBasename(const char *path)
Definition: udata.cpp:225
C API: Data loading interface.
int32_t UChar32
Definition: umachine.h:467
int8_t UBool
Definition: umachine.h:269
#define U_OVERRIDE
Definition: umachine.h:130
#define U_SENTINEL
Definition: umachine.h:487
#define U_SIZEOF_UCHAR
Definition: umachine.h:358
struct UMutableCPTrie UMutableCPTrie
uint32_t udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode)
Definition: unewdata.cpp:154
void udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length)
Definition: unewdata.cpp:227
UNewDataMemory * udata_create(const char *dir, const char *type, const char *name, const UDataInfo *pInfo, const char *comment, UErrorCode *pErrorCode)
Definition: unewdata.cpp:36
void udata_writeUString(UNewDataMemory *pData, const UChar *s, int32_t length)
Definition: unewdata.cpp:267
Definition: obx.h:51
C++ API: Unicode String.
#define umutablecptrie_buildImmutable
Definition: urename.h:1167
#define umutablecptrie_open
Definition: urename.h:1174
#define u_versionToString
Definition: urename.h:410
#define umutablecptrie_set
Definition: urename.h:1175
#define u_versionFromString
Definition: urename.h:408
#define ucptrie_toBinary
Definition: urename.h:800
#define umutablecptrie_close
Definition: urename.h:1169
#define umutablecptrie_get
Definition: urename.h:1172
#define u_countChar32
Definition: urename.h:222
#define umutablecptrie_getRange
Definition: urename.h:1173
#define umutablecptrie_setRange
Definition: urename.h:1176
#define u_strToUTF8
Definition: urename.h:371
#define U16_LEAD(supplementary)
Definition: utf16.h:123
#define U16_LENGTH(c)
Definition: utf16.h:141
#define U_IS_SURROGATE(c)
Definition: utf.h:193
@ start
Definition: preamble.c:52
@ range
Definition: preamble.c:52
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Definition: utypes.h:431
@ U_BUFFER_OVERFLOW_ERROR
Definition: utypes.h:481
@ U_ILLEGAL_ARGUMENT_ERROR
Definition: utypes.h:467
@ U_INVALID_FORMAT_ERROR
Definition: utypes.h:469
@ U_FILE_ACCESS_ERROR
Definition: utypes.h:470
@ U_ZERO_ERROR
Definition: utypes.h:465
@ U_INTERNAL_PROGRAM_ERROR
Definition: utypes.h:471
#define U_SUCCESS(x)
Definition: utypes.h:730
#define U_COPYRIGHT_STRING
Definition: uvernum.h:56
uint8_t UVersionInfo[4]
Definition: uversion.h:59
#define U_MAX_VERSION_LENGTH
Definition: uversion.h:43
#define U_MAX_VERSION_STRING_LENGTH
Definition: uversion.h:53
FILE * usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator)
Definition: writesrc.cpp:75
void usrc_writeArray(FILE *f, const char *prefix, const void *p, int32_t width, int32_t length, const char *postfix)
Definition: writesrc.cpp:122
void usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie)
Definition: writesrc.cpp:293
#define nullptr
Definition: xetex.h:75
#define buffer
Definition: xmlparse.c:611
#define errorCode
Definition: xmlparse.c:601
#define end(cp)
Definition: zic.c:71