w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

ucm.cpp
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucm.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003jun20
16 * created by: Markus W. Scherer
17 *
18 * This file reads a .ucm file, stores its mappings and sorts them.
19 * It implements handling of Unicode conversion mappings from .ucm files
20 * for makeconv, canonucm, rptp2ucm, etc.
21 *
22 * Unicode code point sequences with a length of more than 1,
23 * as well as byte sequences with more than 4 bytes or more than one complete
24 * character sequence are handled to support m:n mappings.
25 */
26 
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "filestrm.h"
32 #include "uarrsort.h"
33 #include "ucnvmbcs.h"
34 #include "ucnv_bld.h"
35 #include "ucnv_ext.h"
36 #include "uparse.h"
37 #include "ucm.h"
38 #include <stdio.h>
39 
40 #if !UCONFIG_NO_CONVERSION
41 
42 /* -------------------------------------------------------------------------- */
43 
44 static void
46  int32_t j;
47 
48  for(j=0; j<m->uLen; ++j) {
49  fprintf(f, "<U%04lX>", (long)codePoints[j]);
50  }
51 
52  fputc(' ', f);
53 
54  for(j=0; j<m->bLen; ++j) {
55  fprintf(f, "\\x%02X", bytes[j]);
56  }
57 
58  if(m->f>=0) {
59  fprintf(f, " |%u\n", m->f);
60  } else {
61  fputs("\n", f);
62  }
63 }
64 
65 U_CAPI void U_EXPORT2
68 }
69 
70 U_CAPI void U_EXPORT2
72  UCMapping *m;
73  int32_t i, length;
74 
75  m=table->mappings;
76  length=table->mappingsLength;
77  if(byUnicode) {
78  for(i=0; i<length; ++m, ++i) {
80  }
81  } else {
82  const int32_t *map=table->reverseMap;
83  for(i=0; i<length; ++i) {
85  }
86  }
87 }
88 
89 /* mapping comparisons ------------------------------------------------------ */
90 
91 static int32_t
93  UCMTable *rTable, const UCMapping *r) {
94  const UChar32 *lu, *ru;
96 
97  if(l->uLen==1 && r->uLen==1) {
98  /* compare two single code points */
99  return l->u-r->u;
100  }
101 
102  /* get pointers to the code point sequences */
103  lu=UCM_GET_CODE_POINTS(lTable, l);
104  ru=UCM_GET_CODE_POINTS(rTable, r);
105 
106  /* get the minimum length */
107  if(l->uLen<=r->uLen) {
108  length=l->uLen;
109  } else {
110  length=r->uLen;
111  }
112 
113  /* compare the code points */
114  for(i=0; i<length; ++i) {
115  result=lu[i]-ru[i];
116  if(result!=0) {
117  return result;
118  }
119  }
120 
121  /* compare the lengths */
122  return l->uLen-r->uLen;
123 }
124 
125 static int32_t
127  UCMTable *rTable, const UCMapping *r,
128  UBool lexical) {
129  const uint8_t *lb, *rb;
131 
132  /*
133  * A lexical comparison is used for sorting in the builder, to allow
134  * an efficient search for a byte sequence that could be a prefix
135  * of a previously entered byte sequence.
136  *
137  * Comparing by lengths first is for compatibility with old .ucm tools
138  * like canonucm and rptp2ucm.
139  */
140  if(lexical) {
141  /* get the minimum length and continue */
142  if(l->bLen<=r->bLen) {
143  length=l->bLen;
144  } else {
145  length=r->bLen;
146  }
147  } else {
148  /* compare lengths first */
149  result=l->bLen-r->bLen;
150  if(result!=0) {
151  return result;
152  } else {
153  length=l->bLen;
154  }
155  }
156 
157  /* get pointers to the byte sequences */
158  lb=UCM_GET_BYTES(lTable, l);
159  rb=UCM_GET_BYTES(rTable, r);
160 
161  /* compare the bytes */
162  for(i=0; i<length; ++i) {
163  result=lb[i]-rb[i];
164  if(result!=0) {
165  return result;
166  }
167  }
168 
169  /* compare the lengths */
170  return l->bLen-r->bLen;
171 }
172 
173 /* compare UCMappings for sorting */
174 static int32_t
176  UCMTable *rTable, const UCMapping *r,
177  UBool uFirst) {
178  int32_t result;
179 
180  /* choose which side to compare first */
181  if(uFirst) {
182  /* Unicode then bytes */
183  result=compareUnicode(lTable, l, rTable, r);
184  if(result==0) {
185  result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
186  }
187  } else {
188  /* bytes then Unicode */
189  result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
190  if(result==0) {
191  result=compareUnicode(lTable, l, rTable, r);
192  }
193  }
194 
195  if(result!=0) {
196  return result;
197  }
198 
199  /* compare the flags */
200  return l->f-r->f;
201 }
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t U_CALLCONV
205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206  return compareMappings(
207  (UCMTable *)context, (const UCMapping *)left,
208  (UCMTable *)context, (const UCMapping *)right, TRUE);
209 }
210 
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t U_CALLCONV
213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
215  int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216  return compareMappings(
217  table, table->mappings+l,
218  table, table->mappings+r, FALSE);
219 }
221 
222 U_CAPI void U_EXPORT2
225  int32_t i;
226 
227  if(t->isSorted) {
228  return;
229  }
230 
232 
233  /* 1. sort by Unicode first */
234  uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
236  FALSE, &errorCode);
237 
238  /* build the reverseMap */
239  if(t->reverseMap==NULL) {
240  /*
241  * allocate mappingsCapacity instead of mappingsLength so that
242  * if mappings are added, the reverseMap need not be
243  * reallocated each time
244  * (see ucm_moveMappings() and ucm_addMapping())
245  */
246  t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
247  if(t->reverseMap==NULL) {
248  fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
250  }
251  }
252  for(i=0; i<t->mappingsLength; ++i) {
253  t->reverseMap[i]=i;
254  }
255 
256  /* 2. sort reverseMap by mappings bytes first */
257  uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
259  FALSE, &errorCode);
260 
261  if(U_FAILURE(errorCode)) {
262  fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
264  exit(errorCode);
265  }
266 
267  t->isSorted=TRUE;
268 }
269 
270 /*
271  * remove mappings with their move flag set from the base table
272  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
273  */
274 U_CAPI void U_EXPORT2
276  UCMapping *mb, *mbLimit;
277  int8_t flag;
278 
279  mb=base->mappings;
280  mbLimit=mb+base->mappingsLength;
281 
282  while(mb<mbLimit) {
283  flag=mb->moveFlag;
284  if(flag!=0) {
285  /* reset the move flag */
286  mb->moveFlag=0;
287 
288  if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
289  /* add the mapping to the extension table */
291  }
292 
293  /* remove this mapping: move the last base mapping down and overwrite the current one */
294  if(mb<(mbLimit-1)) {
295  uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
296  }
297  --mbLimit;
298  --base->mappingsLength;
299  base->isSorted=FALSE;
300  } else {
301  ++mb;
302  }
303  }
304 }
305 
306 enum {
308  HAS_ERRORS=2
309 };
310 
311 static uint8_t
313  UBool moveToExt, UBool intersectBase) {
314  (void)baseStates;
315 
316  UCMapping *mb, *me, *mbLimit, *meLimit;
317  int32_t cmp;
318  uint8_t result;
319 
320  mb=base->mappings;
321  mbLimit=mb+base->mappingsLength;
322 
323  me=ext->mappings;
324  meLimit=me+ext->mappingsLength;
325 
326  result=0;
327 
328  for(;;) {
329  /* skip irrelevant mappings on both sides */
330  for(;;) {
331  if(mb==mbLimit) {
332  return result;
333  }
334 
335  if((0<=mb->f && mb->f<=2) || mb->f==4) {
336  break;
337  }
338 
339  ++mb;
340  }
341 
342  for(;;) {
343  if(me==meLimit) {
344  return result;
345  }
346 
347  if((0<=me->f && me->f<=2) || me->f==4) {
348  break;
349  }
350 
351  ++me;
352  }
353 
354  /* compare the base and extension mappings */
355  cmp=compareUnicode(base, mb, ext, me);
356  if(cmp<0) {
357  if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
358  /*
359  * mapping in base but not in ext, move it
360  *
361  * if ext is DBCS, move DBCS mappings here
362  * and check SBCS ones for Unicode prefix below
363  */
366 
367  /* does mb map from an input sequence that is a prefix of me's? */
368  } else if( mb->uLen<me->uLen &&
370  ) {
371  if(moveToExt) {
372  /* mark this mapping to be moved to the extension table */
375  } else {
376  fprintf(stderr,
377  "ucm error: the base table contains a mapping whose input sequence\n"
378  " is a prefix of the input sequence of an extension mapping\n");
379  ucm_printMapping(base, mb, stderr);
380  ucm_printMapping(ext, me, stderr);
382  }
383  }
384 
385  ++mb;
386  } else if(cmp==0) {
387  /*
388  * same output: remove the extension mapping,
389  * otherwise treat as an error
390  */
391  if( mb->f==me->f && mb->bLen==me->bLen &&
392  0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
393  ) {
396  } else if(intersectBase) {
397  /* mapping in base but not in ext, move it */
400  } else {
401  fprintf(stderr,
402  "ucm error: the base table contains a mapping whose input sequence\n"
403  " is the same as the input sequence of an extension mapping\n"
404  " but it maps differently\n");
405  ucm_printMapping(base, mb, stderr);
406  ucm_printMapping(ext, me, stderr);
408  }
409 
410  ++mb;
411  } else /* cmp>0 */ {
412  ++me;
413  }
414  }
415 }
416 
417 static uint8_t
419  UBool moveToExt, UBool intersectBase) {
420  UCMapping *mb, *me;
421  int32_t *baseMap, *extMap;
422  int32_t b, e, bLimit, eLimit, cmp;
423  uint8_t result;
424  UBool isSISO;
425 
426  baseMap=base->reverseMap;
427  extMap=ext->reverseMap;
428 
429  b=e=0;
430  bLimit=base->mappingsLength;
431  eLimit=ext->mappingsLength;
432 
433  result=0;
434 
435  isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
436 
437  for(;;) {
438  /* skip irrelevant mappings on both sides */
439  for(;; ++b) {
440  if(b==bLimit) {
441  return result;
442  }
443  mb=base->mappings+baseMap[b];
444 
445  if(intersectBase==2 && mb->bLen==1) {
446  /*
447  * comparing a base against a DBCS extension:
448  * leave SBCS base mappings alone
449  */
450  continue;
451  }
452 
453  if(mb->f==0 || mb->f==3) {
454  break;
455  }
456  }
457 
458  for(;;) {
459  if(e==eLimit) {
460  return result;
461  }
462  me=ext->mappings+extMap[e];
463 
464  if(me->f==0 || me->f==3) {
465  break;
466  }
467 
468  ++e;
469  }
470 
471  /* compare the base and extension mappings */
472  cmp=compareBytes(base, mb, ext, me, TRUE);
473  if(cmp<0) {
474  if(intersectBase) {
475  /* mapping in base but not in ext, move it */
478 
479  /*
480  * does mb map from an input sequence that is a prefix of me's?
481  * for SI/SO tables, a single byte is never a prefix because it
482  * occurs in a separate single-byte state
483  */
484  } else if( mb->bLen<me->bLen &&
485  (!isSISO || mb->bLen>1) &&
486  0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
487  ) {
488  if(moveToExt) {
489  /* mark this mapping to be moved to the extension table */
492  } else {
493  fprintf(stderr,
494  "ucm error: the base table contains a mapping whose input sequence\n"
495  " is a prefix of the input sequence of an extension mapping\n");
496  ucm_printMapping(base, mb, stderr);
497  ucm_printMapping(ext, me, stderr);
499  }
500  }
501 
502  ++b;
503  } else if(cmp==0) {
504  /*
505  * same output: remove the extension mapping,
506  * otherwise treat as an error
507  */
508  if( mb->f==me->f && mb->uLen==me->uLen &&
510  ) {
513  } else if(intersectBase) {
514  /* mapping in base but not in ext, move it */
517  } else {
518  fprintf(stderr,
519  "ucm error: the base table contains a mapping whose input sequence\n"
520  " is the same as the input sequence of an extension mapping\n"
521  " but it maps differently\n");
522  ucm_printMapping(base, mb, stderr);
523  ucm_printMapping(ext, me, stderr);
525  }
526 
527  ++b;
528  } else /* cmp>0 */ {
529  ++e;
530  }
531  }
532 }
533 
536  UCMapping *m, *mLimit;
537  int32_t count;
538  UBool isOK;
539 
540  m=table->mappings;
541  mLimit=m+table->mappingsLength;
542  isOK=TRUE;
543 
544  while(m<mLimit) {
545  count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
546  if(count<1) {
547  ucm_printMapping(table, m, stderr);
548  isOK=FALSE;
549  }
550  ++m;
551  }
552 
553  return isOK;
554 }
555 
558  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
559  UBool intersectBase) {
560  uint8_t result;
561 
562  /* if we have an extension table, we must always use precision flags */
563  if(base->flagsType&UCM_FLAGS_IMPLICIT) {
564  fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
565  return FALSE;
566  }
567  if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
568  fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
569  return FALSE;
570  }
571 
572  /* checking requires both tables to be sorted */
575 
576  /* check */
577  result=
578  checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
579  checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
580 
581  if(result&HAS_ERRORS) {
582  return FALSE;
583  }
584 
585  if(result&NEEDS_MOVE) {
587  ucm_moveMappings(base, moveTarget);
590  if(moveTarget!=NULL) {
591  ucm_sortTable(moveTarget);
592  }
593  }
594 
595  return TRUE;
596 }
597 
598 /* merge tables for rptp2ucm ------------------------------------------------ */
599 
600 U_CAPI void U_EXPORT2
601 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
602  const uint8_t *subchar, int32_t subcharLength,
603  uint8_t subchar1) {
604  UCMapping *fromUMapping, *toUMapping;
605  int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
606 
607  ucm_sortTable(fromUTable);
608  ucm_sortTable(toUTable);
609 
610  fromUMapping=fromUTable->mappings;
611  toUMapping=toUTable->mappings;
612 
613  fromUTop=fromUTable->mappingsLength;
614  toUTop=toUTable->mappingsLength;
615 
616  fromUIndex=toUIndex=0;
617 
618  while(fromUIndex<fromUTop && toUIndex<toUTop) {
619  cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
620  if(cmp==0) {
621  /* equal: roundtrip, nothing to do (flags are initially 0) */
622  ++fromUMapping;
623  ++toUMapping;
624 
625  ++fromUIndex;
626  ++toUIndex;
627  } else if(cmp<0) {
628  /*
629  * the fromU mapping does not have a toU counterpart:
630  * fallback Unicode->codepage
631  */
632  if( (fromUMapping->bLen==subcharLength &&
633  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
634  (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
635  ) {
636  fromUMapping->f=2; /* SUB mapping */
637  } else {
638  fromUMapping->f=1; /* normal fallback */
639  }
640 
641  ++fromUMapping;
642  ++fromUIndex;
643  } else {
644  /*
645  * the toU mapping does not have a fromU counterpart:
646  * (reverse) fallback codepage->Unicode, copy it to the fromU table
647  */
648 
649  /* ignore reverse fallbacks to Unicode SUB */
650  if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
651  toUMapping->f=3; /* reverse fallback */
652  ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
653 
654  /* the table may have been reallocated */
655  fromUMapping=fromUTable->mappings+fromUIndex;
656  }
657 
658  ++toUMapping;
659  ++toUIndex;
660  }
661  }
662 
663  /* either one or both tables are exhausted */
664  while(fromUIndex<fromUTop) {
665  /* leftover fromU mappings are fallbacks */
666  if( (fromUMapping->bLen==subcharLength &&
667  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
668  (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
669  ) {
670  fromUMapping->f=2; /* SUB mapping */
671  } else {
672  fromUMapping->f=1; /* normal fallback */
673  }
674 
675  ++fromUMapping;
676  ++fromUIndex;
677  }
678 
679  while(toUIndex<toUTop) {
680  /* leftover toU mappings are reverse fallbacks */
681 
682  /* ignore reverse fallbacks to Unicode SUB */
683  if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
684  toUMapping->f=3; /* reverse fallback */
685  ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
686  }
687 
688  ++toUMapping;
689  ++toUIndex;
690  }
691 
692  fromUTable->isSorted=FALSE;
693 }
694 
695 /* separate extension mappings out of base table for rptp2ucm --------------- */
696 
699  UCMTable *table;
700  UCMapping *m, *mLimit;
701  int32_t type;
702  UBool needsMove, isOK;
703 
704  table=ucm->base;
705  m=table->mappings;
706  mLimit=m+table->mappingsLength;
707 
708  needsMove=FALSE;
709  isOK=TRUE;
710 
711  for(; m<mLimit; ++m) {
712  if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
713  fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
714  ucm_printMapping(table, m, stderr);
715  m->moveFlag|=UCM_REMOVE_MAPPING;
716  needsMove=TRUE;
717  continue;
718  }
719 
721  &ucm->states, m,
723  if(type<0) {
724  /* illegal byte sequence */
726  isOK=FALSE;
727  } else if(type>0) {
728  m->moveFlag|=UCM_MOVE_TO_EXT;
729  needsMove=TRUE;
730  }
731  }
732 
733  if(!isOK) {
734  return FALSE;
735  }
736  if(needsMove) {
737  ucm_moveMappings(ucm->base, ucm->ext);
738  return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
739  } else {
740  ucm_sortTable(ucm->base);
741  return TRUE;
742  }
743 }
744 
745 /* ucm parser --------------------------------------------------------------- */
746 
748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
749  const char *s=*ps;
750  char *end;
751  uint8_t byte;
752  int8_t bLen;
753 
754  bLen=0;
755  for(;;) {
756  /* skip an optional plus sign */
757  if(bLen>0 && *s=='+') {
758  ++s;
759  }
760  if(*s!='\\') {
761  break;
762  }
763 
764  if( s[1]!='x' ||
765  (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
766  ) {
767  fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
768  return -1;
769  }
770 
771  if(bLen==UCNV_EXT_MAX_BYTES) {
772  fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
773  return -1;
774  }
775  bytes[bLen++]=byte;
776  s=end;
777  }
778 
779  *ps=s;
780  return bLen;
781 }
782 
783 /* parse a mapping line; must not be empty */
786  UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
788  const char *line) {
789  const char *s;
790  char *end;
791  UChar32 cp;
792  int32_t u16Length;
793  int8_t uLen, bLen, f;
794 
795  s=line;
796  uLen=bLen=0;
797 
798  /* parse code points */
799  for(;;) {
800  /* skip an optional plus sign */
801  if(uLen>0 && *s=='+') {
802  ++s;
803  }
804  if(*s!='<') {
805  break;
806  }
807 
808  if( s[1]!='U' ||
809  (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
810  *end!='>'
811  ) {
812  fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
813  return FALSE;
814  }
815  if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
816  fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
817  return FALSE;
818  }
819 
820  if(uLen==UCNV_EXT_MAX_UCHARS) {
821  fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
822  return FALSE;
823  }
824  codePoints[uLen++]=cp;
825  s=end+1;
826  }
827 
828  if(uLen==0) {
829  fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
830  return FALSE;
831  } else if(uLen==1) {
832  m->u=codePoints[0];
833  } else {
835  u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
837  u16Length>UCNV_EXT_MAX_UCHARS
838  ) {
839  fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
840  return FALSE;
841  }
842  }
843 
845 
846  /* parse bytes */
847  bLen=ucm_parseBytes(bytes, line, &s);
848 
849  if(bLen<0) {
850  return FALSE;
851  } else if(bLen==0) {
852  fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
853  return FALSE;
854  } else if(bLen<=4) {
855  uprv_memcpy(m->b.bytes, bytes, bLen);
856  }
857 
858  /* skip everything until the fallback indicator, even the start of a comment */
859  for(;;) {
860  if(*s==0) {
861  f=-1; /* no fallback indicator */
862  break;
863  } else if(*s=='|') {
864  f=(int8_t)(s[1]-'0');
865  if((uint8_t)f>4) {
866  fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
867  return FALSE;
868  }
869  break;
870  }
871  ++s;
872  }
873 
874  m->uLen=uLen;
875  m->bLen=bLen;
876  m->f=f;
877  return TRUE;
878 }
879 
880 /* general APIs ------------------------------------------------------------- */
881 
885  if(table==NULL) {
886  fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
888  }
889 
890  memset(table, 0, sizeof(UCMTable));
891  return table;
892 }
893 
894 U_CAPI void U_EXPORT2
896  if(table!=NULL) {
897  uprv_free(table->mappings);
898  uprv_free(table->codePoints);
899  uprv_free(table->bytes);
900  uprv_free(table->reverseMap);
901  uprv_free(table);
902  }
903 }
904 
905 U_CAPI void U_EXPORT2
907  if(table!=NULL) {
908  table->mappingsLength=0;
909  table->flagsType=0;
910  table->unicodeMask=0;
911  table->bytesLength=table->codePointsLength=0;
912  table->isSorted=FALSE;
913  }
914 }
915 
916 U_CAPI void U_EXPORT2
918  UCMapping *m,
919  UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
921  UCMapping *tm;
922  UChar32 c;
923  int32_t idx;
924 
925  if(table->mappingsLength>=table->mappingsCapacity) {
926  /* make the mappings array larger */
927  if(table->mappingsCapacity==0) {
928  table->mappingsCapacity=1000;
929  } else {
930  table->mappingsCapacity*=10;
931  }
932  table->mappings=(UCMapping *)uprv_realloc(table->mappings,
933  table->mappingsCapacity*sizeof(UCMapping));
934  if(table->mappings==NULL) {
935  fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
936  (int)table->mappingsCapacity);
938  }
939 
940  if(table->reverseMap!=NULL) {
941  /* the reverseMap must be reallocated in a new sort */
942  uprv_free(table->reverseMap);
943  table->reverseMap=NULL;
944  }
945  }
946 
947  if(m->uLen>1 && table->codePointsCapacity==0) {
948  table->codePointsCapacity=10000;
949  table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
950  if(table->codePoints==NULL) {
951  fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
952  (int)table->codePointsCapacity);
954  }
955  }
956 
957  if(m->bLen>4 && table->bytesCapacity==0) {
958  table->bytesCapacity=10000;
959  table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
960  if(table->bytes==NULL) {
961  fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
962  (int)table->bytesCapacity);
964  }
965  }
966 
967  if(m->uLen>1) {
968  idx=table->codePointsLength;
969  table->codePointsLength+=m->uLen;
970  if(table->codePointsLength>table->codePointsCapacity) {
971  fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
973  }
974 
975  uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
976  m->u=idx;
977  }
978 
979  if(m->bLen>4) {
980  idx=table->bytesLength;
981  table->bytesLength+=m->bLen;
982  if(table->bytesLength>table->bytesCapacity) {
983  fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
985  }
986 
987  uprv_memcpy(table->bytes+idx, bytes, m->bLen);
988  m->b.idx=idx;
989  }
990 
991  /* set unicodeMask */
992  for(idx=0; idx<m->uLen; ++idx) {
993  c=codePoints[idx];
994  if(c>=0x10000) {
995  table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
996  } else if(U_IS_SURROGATE(c)) {
997  table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
998  }
999  }
1000 
1001  /* set flagsType */
1002  if(m->f<0) {
1003  table->flagsType|=UCM_FLAGS_IMPLICIT;
1004  } else {
1005  table->flagsType|=UCM_FLAGS_EXPLICIT;
1006  }
1007 
1008  tm=table->mappings+table->mappingsLength++;
1009  uprv_memcpy(tm, m, sizeof(UCMapping));
1010 
1011  table->isSorted=FALSE;
1012 }
1013 
1016  UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1017  if(ucm==NULL) {
1018  fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1020  }
1021 
1022  memset(ucm, 0, sizeof(UCMFile));
1023 
1024  ucm->base=ucm_openTable();
1025  ucm->ext=ucm_openTable();
1026 
1029  ucm->states.outputType=-1;
1031 
1032  return ucm;
1033 }
1034 
1035 U_CAPI void U_EXPORT2
1037  if(ucm!=NULL) {
1038  ucm_closeTable(ucm->base);
1039  ucm_closeTable(ucm->ext);
1040  uprv_free(ucm);
1041  }
1042 }
1043 
1046  UCMapping *m,
1047  UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1049  (void)codePoints;
1050  /* check validity of the bytes and count the characters in them */
1051  int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1052  if(count<1) {
1053  /* illegal byte sequence */
1054  return -1;
1055  }
1056 
1057  /*
1058  * Suitable for an ICU conversion base table means:
1059  * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1060  * - precision flag 0..3
1061  * - SBCS: any 1:1 mapping
1062  * (the table stores additional bits to distinguish mapping types)
1063  * - MBCS: not a |2 SUB mapping for <subchar1>
1064  * - MBCS: not a |1 fallback to 0x00
1065  * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1066  *
1067  * Further restrictions for fromUnicode tables
1068  * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1069  *
1070  * All of the MBCS fromUnicode specific tests could be removed from here,
1071  * but the ones above are for unusual mappings, and removing the tests
1072  * from here would change canonucm output which seems gratuitous.
1073  * (Markus Scherer 2006-nov-28)
1074  *
1075  * Exception: All implicit mappings (f<0) that need to be moved
1076  * because of fromUnicode restrictions _must_ be moved here because
1077  * makeconv uses a hack for moving mappings only for the fromUnicode table
1078  * that only works with non-negative values of f.
1079  */
1080  if( m->uLen==1 && count==1 && m->f<=3 &&
1081  (baseStates->maxCharLength==1 ||
1082  !((m->f==2 && m->bLen==1) ||
1083  (m->f==1 && bytes[0]==0) ||
1084  (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1085  ) {
1086  return 0; /* suitable for a base table */
1087  } else {
1088  return 1; /* needs to go into an extension table */
1089  }
1090 }
1091 
1093 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1094  UCMapping *m,
1095  UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1097  int32_t type;
1098 
1099  if(m->f==2 && m->uLen>1) {
1100  fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1101  printMapping(m, codePoints, bytes, stderr);
1102  return FALSE;
1103  }
1104 
1105  if(baseStates!=NULL) {
1106  /* check validity of the bytes and count the characters in them */
1107  type=ucm_mappingType(baseStates, m, codePoints, bytes);
1108  if(type<0) {
1109  /* illegal byte sequence */
1110  printMapping(m, codePoints, bytes, stderr);
1111  return FALSE;
1112  }
1113  } else {
1114  /* not used - adding a mapping for an extension-only table before its base table is read */
1115  type=1;
1116  }
1117 
1118  /*
1119  * Add the mapping to the base table if this is requested and suitable.
1120  * Otherwise, add it to the extension table.
1121  */
1122  if(forBase && type==0) {
1123  ucm_addMapping(ucm->base, m, codePoints, bytes);
1124  } else {
1125  ucm_addMapping(ucm->ext, m, codePoints, bytes);
1126  }
1127 
1128  return TRUE;
1129 }
1130 
1132 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1133  UCMapping m={ 0, {0}, 0, 0, 0, 0 };
1134  UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1136 
1137  const char *s;
1138 
1139  /* ignore empty and comment lines */
1140  if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1141  return TRUE;
1142  }
1143 
1144  return
1145  ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1146  ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1147 }
1148 
1149 U_CAPI void U_EXPORT2
1151  UBool forBase, UCMStates *baseStates,
1152  UErrorCode *pErrorCode) {
1153  char line[500];
1154  char *end;
1155  UBool isOK;
1156 
1157  if(U_FAILURE(*pErrorCode)) {
1158  return;
1159  }
1160 
1161  isOK=TRUE;
1162 
1163  for(;;) {
1164  /* read the next line */
1165  if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1166  fprintf(stderr, "incomplete charmap section\n");
1167  isOK=FALSE;
1168  break;
1169  }
1170 
1171  /* remove CR LF */
1172  end=uprv_strchr(line, 0);
1173  while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1174  --end;
1175  }
1176  *end=0;
1177 
1178  /* ignore empty and comment lines */
1179  if(line[0]==0 || line[0]=='#') {
1180  continue;
1181  }
1182 
1183  /* stop at the end of the mapping table */
1184  if(0==uprv_strcmp(line, "END CHARMAP")) {
1185  break;
1186  }
1187 
1188  isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1189  }
1190 
1191  if(!isOK) {
1192  *pErrorCode=U_INVALID_TABLE_FORMAT;
1193  }
1194 }
1195 #endif
cp
Definition: action.c:1035
struct @88 table[500]
#define type(a)
Definition: aptex-macros.h:171
#define count(a)
Definition: aptex-macros.h:781
int cmp(const void *p, const void *q)
Definition: bkmk2uni.c:1611
#define uprv_memcmp(buffer1, buffer2, size)
Definition: cmemory.h:52
#define uprv_memcpy(dst, src, size)
Definition: cmemory.h:40
#define b
Definition: jpegint.h:372
#define uprv_strcmp(s1, s2)
Definition: cstring.h:38
#define uprv_strtoul(str, end, base)
Definition: cstring.h:76
#define uprv_strchr(s, c)
Definition: cstring.h:40
@ FALSE
Definition: dd.h:101
@ TRUE
Definition: dd.h:102
long int flag
Definition: f2c.h:53
char * T_FileStream_readLine(FileStream *fileStream, char *buffer, int32_t length)
Definition: filestrm.cpp:153
static void
Definition: fpif.c:118
mpz_t * f
Definition: gen-fib.c:34
#define s
Definition: afcover.h:80
#define c(n)
Definition: gpos-common.c:150
int base
Definition: gsftopk.c:1502
#define byte
Definition: in_pcx.cpp:28
#define NULL
Definition: ftobjs.h:61
small capitals from c petite p scientific i
Definition: afcover.h:80
FT_UInt idx
Definition: cffcmap.c:135
void exit()
@ right
Definition: annotate.c:15
unsigned int uint32_t
Definition: stdint.h:80
signed int int32_t
Definition: stdint.h:77
unsigned char uint8_t
Definition: stdint.h:78
signed char int8_t
Definition: stdint.h:75
#define fputs
Definition: mendex.h:67
#define fprintf
Definition: mendex.h:64
#define length(c)
Definition: ctangleboot.c:65
#define U_CALLCONV
Definition: platform.h:877
#define U_EXPORT2
Definition: platform.h:844
static struct tm tm
Definition: localtime.c:216
static bool ps
Definition: pdftocairo.cc:91
int r
Definition: ppmqvga.c:68
char line[1024]
Definition: process_score.c:29
bstring c int memset(void *s, int c, int length)
static long bytes
Definition: psutil.c:35
#define map
C API: Unicode string handling functions.
#define flag
Definition: round_prec.c:45
ShellFileEnvironment e
Definition: sh6.c:388
#define int8_t
Definition: stdint.in.h:153
Definition: ucm.h:104
UCMTable * base
Definition: ucm.h:105
UCMTable * ext
Definition: ucm.h:105
UCMStates states
Definition: ucm.h:106
Definition: ucm.h:95
int32_t maxCharLength
Definition: ucm.h:100
int32_t minCharLength
Definition: ucm.h:100
int8_t conversionType
Definition: ucm.h:101
int8_t outputType
Definition: ucm.h:101
uint32_t stateFlags[MBCS_MAX_STATE_COUNT]
Definition: ucm.h:97
Definition: ucm.h:70
UBool isSorted
Definition: ucm.h:85
UCMapping * mappings
Definition: ucm.h:71
int32_t mappingsLength
Definition: ucm.h:72
Definition: ucm.h:53
int8_t uLen
Definition: ucm.h:59
UChar32 u
Definition: ucm.h:54
int8_t moveFlag
Definition: ucm.h:59
int8_t bLen
Definition: ucm.h:59
uint8_t bytes[4]
Definition: ucm.h:57
union UCMapping::@1283 b
int8_t f
Definition: ucm.h:59
Definition: bdf.c:133
Definition: dvips.h:235
Definition: table.h:30
#define FILE
Definition: t1stdio.h:34
int j
Definition: t4ht.c:1589
char * ext
Definition: t4ht.c:938
m
Definition: tex4ht.c:3990
static uint8_t checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt, UBool intersectBase)
Definition: ucm.cpp:418
void ucm_addMapping(UCMTable *table, UCMapping *m, UChar32 codePoints[19], uint8_t bytes[0x1f])
Definition: ucm.cpp:917
UBool ucm_separateMappings(UCMFile *ucm, UBool isSISO)
Definition: ucm.cpp:698
void ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f)
Definition: ucm.cpp:66
UCMFile * ucm_open()
Definition: ucm.cpp:1015
UBool ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, UCMapping *m, UChar32 codePoints[19], uint8_t bytes[0x1f])
Definition: ucm.cpp:1093
static int32_t compareUnicode(UCMTable *lTable, const UCMapping *l, UCMTable *rTable, const UCMapping *r)
Definition: ucm.cpp:92
static int32_t compareMappingsBytesFirst(const void *context, const void *left, const void *right)
Definition: ucm.cpp:213
static int32_t compareMappingsUnicodeFirst(const void *context, const void *left, const void *right)
Definition: ucm.cpp:205
UBool ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UCMTable *moveTarget, UBool intersectBase)
Definition: ucm.cpp:557
void ucm_close(UCMFile *ucm)
Definition: ucm.cpp:1036
void ucm_moveMappings(UCMTable *base, UCMTable *ext)
Definition: ucm.cpp:275
UBool ucm_checkValidity(UCMTable *table, UCMStates *baseStates)
Definition: ucm.cpp:535
void ucm_resetTable(UCMTable *table)
Definition: ucm.cpp:906
static uint8_t checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt, UBool intersectBase)
Definition: ucm.cpp:312
void ucm_sortTable(UCMTable *t)
Definition: ucm.cpp:223
UBool ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates)
Definition: ucm.cpp:1132
int8_t ucm_parseBytes(uint8_t bytes[0x1f], const char *line, const char **ps)
Definition: ucm.cpp:748
int32_t ucm_mappingType(UCMStates *baseStates, UCMapping *m, UChar32 codePoints[19], uint8_t bytes[0x1f])
Definition: ucm.cpp:1045
UCMTable * ucm_openTable()
Definition: ucm.cpp:883
void ucm_closeTable(UCMTable *table)
Definition: ucm.cpp:895
static void printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f)
Definition: ucm.cpp:45
UBool ucm_parseMappingLine(UCMapping *m, UChar32 codePoints[19], uint8_t bytes[0x1f], const char *line)
Definition: ucm.cpp:785
static int32_t compareBytes(UCMTable *lTable, const UCMapping *l, UCMTable *rTable, const UCMapping *r, UBool lexical)
Definition: ucm.cpp:126
void ucm_readTable(UCMFile *ucm, FileStream *convFile, UBool forBase, UCMStates *baseStates, UErrorCode *pErrorCode)
Definition: ucm.cpp:1150
@ HAS_ERRORS
Definition: ucm.cpp:308
@ NEEDS_MOVE
Definition: ucm.cpp:307
void ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, const uint8_t *subchar, int32_t subcharLength, uint8_t subchar1)
Definition: ucm.cpp:601
void ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode)
Definition: ucm.cpp:71
static int32_t compareMappings(UCMTable *lTable, const UCMapping *l, UCMTable *rTable, const UCMapping *r, UBool uFirst)
Definition: ucm.cpp:175
int32_t ucm_countChars(UCMStates *states, const uint8_t *bytes, int32_t length)
Definition: ucmstate.cpp:978
#define UCM_GET_BYTES(t, m)
Definition: ucm.h:116
@ UCM_FLAGS_IMPLICIT
Definition: ucm.h:66
@ UCM_FLAGS_EXPLICIT
Definition: ucm.h:65
#define UCM_GET_CODE_POINTS(t, m)
Definition: ucm.h:113
@ MBCS_STATE_FLAG_DIRECT
Definition: ucm.h:89
@ UCM_REMOVE_MAPPING
Definition: ucm.h:35
@ UCM_MOVE_TO_EXT
Definition: ucm.h:34
@ UCNV_UNSUPPORTED_CONVERTER
Definition: ucnv.h:96
#define UCNV_HAS_SURROGATES
Definition: ucnv_bld.h:65
#define UCNV_HAS_SUPPLEMENTARY
Definition: ucnv_bld.h:64
#define UCNV_EXT_MAX_BYTES
Definition: ucnv_ext.h:465
#define UCNV_EXT_MAX_UCHARS
Definition: ucnv_ext.h:419
@ MBCS_OUTPUT_2_SISO
Definition: ucnvmbcs.h:337
int32_t UChar32
Definition: umachine.h:467
#define U_CDECL_END
Definition: umachine.h:86
int8_t UBool
Definition: umachine.h:269
#define U_CAPI
Definition: umachine.h:110
#define U_CDECL_BEGIN
Definition: umachine.h:85
const char * u_skipWhitespace(const char *s)
Definition: uparse.cpp:35
#define uprv_sortArray
Definition: urename.h:1449
#define u_errorName
Definition: urename.h:226
#define uprv_realloc
Definition: urename.h:1447
#define uprv_malloc
Definition: urename.h:1435
#define uprv_free
Definition: urename.h:1414
#define u_strFromUTF32
Definition: urename.h:358
#define U_IS_SURROGATE(c)
Definition: utf.h:193
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Definition: utypes.h:431
@ U_MEMORY_ALLOCATION_ERROR
Definition: utypes.h:473
@ U_BUFFER_OVERFLOW_ERROR
Definition: utypes.h:481
@ U_ZERO_ERROR
Definition: utypes.h:465
@ U_INVALID_TABLE_FORMAT
Definition: utypes.h:479
#define U_FAILURE(x)
Definition: utypes.h:735
#define errorCode
Definition: xmlparse.c:601
#define end(cp)
Definition: zic.c:71