"Fossies" - the Fresh Open Source Software Archive

Member "sitecopy-0.16.6/lib/expat/xmltok/xmltok.c" (10 Oct 2004, 37002 Bytes) of archive /linux/www/sitecopy-0.16.6.tar.gz:


As a special service, "Fossies" has tried to format the requested source page as HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "xmltok.c", see the Fossies "Dox" file reference documentation.

    1 /*
    2 expat XML parser
    3 Copyright (C) 1998 James Clark
    4 
    5 This program is free software; you can redistribute it and/or
    6 modify it under the terms of the GNU General Public License
    7 as published by the Free Software Foundation; either version 2
    8 of the License, or (at your option) any later version.
    9 
   10 This program is distributed in the hope that it will be useful,
   11 but WITHOUT ANY WARRANTY; without even the implied warranty of
   12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13 GNU General Public License for more details.
   14 
   15 You should have received a copy of the GNU General Public License
   16 along with this program; if not, write to the Free Software
   17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
   18 */
   19 
   20 #include "xmldef.h"
   21 #include "xmltok.h"
   22 #include "nametab.h"
   23 
   24 #ifdef XML_DTD
   25 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
   26 #else
   27 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
   28 #endif
   29 
   30 #define VTABLE1 \
   31   { PREFIX(prologTok), PREFIX(contentTok), \
   32     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
   33   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
   34   PREFIX(sameName), \
   35   PREFIX(nameMatchesAscii), \
   36   PREFIX(nameLength), \
   37   PREFIX(skipS), \
   38   PREFIX(getAtts), \
   39   PREFIX(charRefNumber), \
   40   PREFIX(predefinedEntityName), \
   41   PREFIX(updatePosition), \
   42   PREFIX(isPublicId)
   43 
   44 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
   45 
   46 #define UCS2_GET_NAMING(pages, hi, lo) \
   47    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
   48 
   49 /* A 2 byte UTF-8 representation splits the characters 11 bits
   50 between the bottom 5 and 6 bits of the bytes.
   51 We need 8 bits to index into pages, 3 bits to add to that index and
   52 5 bits to generate the mask. */
   53 #define UTF8_GET_NAMING2(pages, byte) \
   54     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
   55                       + ((((byte)[0]) & 3) << 1) \
   56                       + ((((byte)[1]) >> 5) & 1)] \
   57          & (1 << (((byte)[1]) & 0x1F)))
   58 
   59 /* A 3 byte UTF-8 representation splits the characters 16 bits
   60 between the bottom 4, 6 and 6 bits of the bytes.
   61 We need 8 bits to index into pages, 3 bits to add to that index and
   62 5 bits to generate the mask. */
   63 #define UTF8_GET_NAMING3(pages, byte) \
   64   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
   65                              + ((((byte)[1]) >> 2) & 0xF)] \
   66                << 3) \
   67                       + ((((byte)[1]) & 3) << 1) \
   68                       + ((((byte)[2]) >> 5) & 1)] \
   69          & (1 << (((byte)[2]) & 0x1F)))
   70 
   71 #define UTF8_GET_NAMING(pages, p, n) \
   72   ((n) == 2 \
   73   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   74   : ((n) == 3 \
   75      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   76      : 0))
   77 
   78 #define UTF8_INVALID3(p) \
   79   ((*p) == 0xED \
   80   ? (((p)[1] & 0x20) != 0) \
   81   : ((*p) == 0xEF \
   82      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
   83      : 0))
   84 
   85 #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
   86 
   87 static
   88 int isNever(const ENCODING *enc, const char *p)
   89 {
   90   return 0;
   91 }
   92 
   93 static
   94 int utf8_isName2(const ENCODING *enc, const char *p)
   95 {
   96   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
   97 }
   98 
   99 static
  100 int utf8_isName3(const ENCODING *enc, const char *p)
  101 {
  102   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  103 }
  104 
  105 #define utf8_isName4 isNever
  106 
  107 static
  108 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
  109 {
  110   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  111 }
  112 
  113 static
  114 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
  115 {
  116   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  117 }
  118 
  119 #define utf8_isNmstrt4 isNever
  120 
  121 #define utf8_isInvalid2 isNever
  122 
  123 static
  124 int utf8_isInvalid3(const ENCODING *enc, const char *p)
  125 {
  126   return UTF8_INVALID3((const unsigned char *)p);
  127 }
  128 
  129 static
  130 int utf8_isInvalid4(const ENCODING *enc, const char *p)
  131 {
  132   return UTF8_INVALID4((const unsigned char *)p);
  133 }
  134 
  135 struct normal_encoding {
  136   ENCODING enc;
  137   unsigned char type[256];
  138 #ifdef XML_MIN_SIZE
  139   int (*byteType)(const ENCODING *, const char *);
  140   int (*isNameMin)(const ENCODING *, const char *);
  141   int (*isNmstrtMin)(const ENCODING *, const char *);
  142   int (*byteToAscii)(const ENCODING *, const char *);
  143   int (*charMatches)(const ENCODING *, const char *, int);
  144 #endif /* XML_MIN_SIZE */
  145   int (*isName2)(const ENCODING *, const char *);
  146   int (*isName3)(const ENCODING *, const char *);
  147   int (*isName4)(const ENCODING *, const char *);
  148   int (*isNmstrt2)(const ENCODING *, const char *);
  149   int (*isNmstrt3)(const ENCODING *, const char *);
  150   int (*isNmstrt4)(const ENCODING *, const char *);
  151   int (*isInvalid2)(const ENCODING *, const char *);
  152   int (*isInvalid3)(const ENCODING *, const char *);
  153   int (*isInvalid4)(const ENCODING *, const char *);
  154 };
  155 
  156 #ifdef XML_MIN_SIZE
  157 
  158 #define STANDARD_VTABLE(E) \
  159  E ## byteType, \
  160  E ## isNameMin, \
  161  E ## isNmstrtMin, \
  162  E ## byteToAscii, \
  163  E ## charMatches,
  164 
  165 #else
  166 
  167 #define STANDARD_VTABLE(E) /* as nothing */
  168 
  169 #endif
  170 
  171 #define NORMAL_VTABLE(E) \
  172  E ## isName2, \
  173  E ## isName3, \
  174  E ## isName4, \
  175  E ## isNmstrt2, \
  176  E ## isNmstrt3, \
  177  E ## isNmstrt4, \
  178  E ## isInvalid2, \
  179  E ## isInvalid3, \
  180  E ## isInvalid4
  181 
  182 static int checkCharRefNumber(int);
  183 
  184 #include "xmltok_impl.h"
  185 
  186 #ifdef XML_MIN_SIZE
  187 #define sb_isNameMin isNever
  188 #define sb_isNmstrtMin isNever
  189 #endif
  190 
  191 #ifdef XML_MIN_SIZE
  192 #define MINBPC(enc) ((enc)->minBytesPerChar)
  193 #else
  194 /* minimum bytes per character */
  195 #define MINBPC(enc) 1
  196 #endif
  197 
  198 #define SB_BYTE_TYPE(enc, p) \
  199   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  200 
  201 #ifdef XML_MIN_SIZE
  202 static
  203 int sb_byteType(const ENCODING *enc, const char *p)
  204 {
  205   return SB_BYTE_TYPE(enc, p);
  206 }
  207 #define BYTE_TYPE(enc, p) \
  208  (((const struct normal_encoding *)(enc))->byteType(enc, p))
  209 #else
  210 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  211 #endif
  212 
  213 #ifdef XML_MIN_SIZE
  214 #define BYTE_TO_ASCII(enc, p) \
  215  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
  216 static
  217 int sb_byteToAscii(const ENCODING *enc, const char *p)
  218 {
  219   return *p;
  220 }
  221 #else
  222 #define BYTE_TO_ASCII(enc, p) (*p)
  223 #endif
  224 
  225 #define IS_NAME_CHAR(enc, p, n) \
  226  (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
  227 #define IS_NMSTRT_CHAR(enc, p, n) \
  228  (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
  229 #define IS_INVALID_CHAR(enc, p, n) \
  230  (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
  231 
  232 #ifdef XML_MIN_SIZE
  233 #define IS_NAME_CHAR_MINBPC(enc, p) \
  234  (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
  235 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  236  (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
  237 #else
  238 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  240 #endif
  241 
  242 #ifdef XML_MIN_SIZE
  243 #define CHAR_MATCHES(enc, p, c) \
  244  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
  245 static
  246 int sb_charMatches(const ENCODING *enc, const char *p, int c)
  247 {
  248   return *p == c;
  249 }
  250 #else
  251 /* c is an ASCII character */
  252 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
  253 #endif
  254 
  255 #define PREFIX(ident) normal_ ## ident
  256 #include "xmltok_impl.c"
  257 
  258 #undef MINBPC
  259 #undef BYTE_TYPE
  260 #undef BYTE_TO_ASCII
  261 #undef CHAR_MATCHES
  262 #undef IS_NAME_CHAR
  263 #undef IS_NAME_CHAR_MINBPC
  264 #undef IS_NMSTRT_CHAR
  265 #undef IS_NMSTRT_CHAR_MINBPC
  266 #undef IS_INVALID_CHAR
  267 
  268 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  269   UTF8_cval1 = 0x00,
  270   UTF8_cval2 = 0xc0,
  271   UTF8_cval3 = 0xe0,
  272   UTF8_cval4 = 0xf0
  273 };
  274 
  275 static
  276 void utf8_toUtf8(const ENCODING *enc,
  277          const char **fromP, const char *fromLim,
  278          char **toP, const char *toLim)
  279 {
  280   char *to;
  281   const char *from;
  282   if (fromLim - *fromP > toLim - *toP) {
  283     /* Avoid copying partial characters. */
  284     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  285       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  286     break;
  287   }
  288   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  289     *to = *from;
  290   *fromP = from;
  291   *toP = to;
  292 }
  293 
  294 static
  295 void utf8_toUtf16(const ENCODING *enc,
  296           const char **fromP, const char *fromLim,
  297           unsigned short **toP, const unsigned short *toLim)
  298 {
  299   unsigned short *to = *toP;
  300   const char *from = *fromP;
  301   while (from != fromLim && to != toLim) {
  302     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  303     case BT_LEAD2:
  304       *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
  305       from += 2;
  306       break;
  307     case BT_LEAD3:
  308       *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
  309       from += 3;
  310       break;
  311     case BT_LEAD4:
  312       {
  313     unsigned long n;
  314     if (to + 1 == toLim)
  315       break;
  316     n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  317     n -= 0x10000;
  318     to[0] = (unsigned short)((n >> 10) | 0xD800);
  319     to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  320     to += 2;
  321     from += 4;
  322       }
  323       break;
  324     default:
  325       *to++ = *from++;
  326       break;
  327     }
  328   }
  329   *fromP = from;
  330   *toP = to;
  331 }
  332 
  333 #ifdef XML_NS
  334 static const struct normal_encoding utf8_encoding_ns = {
  335   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  336   {
  337 #include "asciitab.h"
  338 #include "utf8tab.h"
  339   },
  340   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  341 };
  342 #endif
  343 
  344 static const struct normal_encoding utf8_encoding = {
  345   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  346   {
  347 #define BT_COLON BT_NMSTRT
  348 #include "asciitab.h"
  349 #undef BT_COLON
  350 #include "utf8tab.h"
  351   },
  352   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  353 };
  354 
  355 #ifdef XML_NS
  356 
  357 static const struct normal_encoding internal_utf8_encoding_ns = {
  358   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  359   {
  360 #include "iasciitab.h"
  361 #include "utf8tab.h"
  362   },
  363   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  364 };
  365 
  366 #endif
  367 
  368 static const struct normal_encoding internal_utf8_encoding = {
  369   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  370   {
  371 #define BT_COLON BT_NMSTRT
  372 #include "iasciitab.h"
  373 #undef BT_COLON
  374 #include "utf8tab.h"
  375   },
  376   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  377 };
  378 
  379 static
  380 void latin1_toUtf8(const ENCODING *enc,
  381            const char **fromP, const char *fromLim,
  382            char **toP, const char *toLim)
  383 {
  384   for (;;) {
  385     unsigned char c;
  386     if (*fromP == fromLim)
  387       break;
  388     c = (unsigned char)**fromP;
  389     if (c & 0x80) {
  390       if (toLim - *toP < 2)
  391     break;
  392       *(*toP)++ = ((c >> 6) | UTF8_cval2);
  393       *(*toP)++ = ((c & 0x3f) | 0x80);
  394       (*fromP)++;
  395     }
  396     else {
  397       if (*toP == toLim)
  398     break;
  399       *(*toP)++ = *(*fromP)++;
  400     }
  401   }
  402 }
  403 
  404 static
  405 void latin1_toUtf16(const ENCODING *enc,
  406             const char **fromP, const char *fromLim,
  407             unsigned short **toP, const unsigned short *toLim)
  408 {
  409   while (*fromP != fromLim && *toP != toLim)
  410     *(*toP)++ = (unsigned char)*(*fromP)++;
  411 }
  412 
  413 #ifdef XML_NS
  414 
  415 static const struct normal_encoding latin1_encoding_ns = {
  416   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  417   {
  418 #include "asciitab.h"
  419 #include "latin1tab.h"
  420   },
  421   STANDARD_VTABLE(sb_)
  422 };
  423 
  424 #endif
  425 
  426 static const struct normal_encoding latin1_encoding = {
  427   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  428   {
  429 #define BT_COLON BT_NMSTRT
  430 #include "asciitab.h"
  431 #undef BT_COLON
  432 #include "latin1tab.h"
  433   },
  434   STANDARD_VTABLE(sb_)
  435 };
  436 
  437 static
  438 void ascii_toUtf8(const ENCODING *enc,
  439           const char **fromP, const char *fromLim,
  440           char **toP, const char *toLim)
  441 {
  442   while (*fromP != fromLim && *toP != toLim)
  443     *(*toP)++ = *(*fromP)++;
  444 }
  445 
  446 #ifdef XML_NS
  447 
  448 static const struct normal_encoding ascii_encoding_ns = {
  449   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  450   {
  451 #include "asciitab.h"
  452 /* BT_NONXML == 0 */
  453   },
  454   STANDARD_VTABLE(sb_)
  455 };
  456 
  457 #endif
  458 
  459 static const struct normal_encoding ascii_encoding = {
  460   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  461   {
  462 #define BT_COLON BT_NMSTRT
  463 #include "asciitab.h"
  464 #undef BT_COLON
  465 /* BT_NONXML == 0 */
  466   },
  467   STANDARD_VTABLE(sb_)
  468 };
  469 
  470 static int unicode_byte_type(char hi, char lo)
  471 {
  472   switch ((unsigned char)hi) {
  473   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  474     return BT_LEAD4;
  475   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  476     return BT_TRAIL;
  477   case 0xFF:
  478     switch ((unsigned char)lo) {
  479     case 0xFF:
  480     case 0xFE:
  481       return BT_NONXML;
  482     }
  483     break;
  484   }
  485   return BT_NONASCII;
  486 }
  487 
  488 #define DEFINE_UTF16_TO_UTF8(E) \
  489 static \
  490 void E ## toUtf8(const ENCODING *enc, \
  491          const char **fromP, const char *fromLim, \
  492          char **toP, const char *toLim) \
  493 { \
  494   const char *from; \
  495   for (from = *fromP; from != fromLim; from += 2) { \
  496     int plane; \
  497     unsigned char lo2; \
  498     unsigned char lo = GET_LO(from); \
  499     unsigned char hi = GET_HI(from); \
  500     switch (hi) { \
  501     case 0: \
  502       if (lo < 0x80) { \
  503         if (*toP == toLim) { \
  504           *fromP = from; \
  505       return; \
  506         } \
  507         *(*toP)++ = lo; \
  508         break; \
  509       } \
  510       /* fall through */ \
  511     case 0x1: case 0x2: case 0x3: \
  512     case 0x4: case 0x5: case 0x6: case 0x7: \
  513       if (toLim -  *toP < 2) { \
  514         *fromP = from; \
  515     return; \
  516       } \
  517       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
  518       *(*toP)++ = ((lo & 0x3f) | 0x80); \
  519       break; \
  520     default: \
  521       if (toLim -  *toP < 3)  { \
  522         *fromP = from; \
  523     return; \
  524       } \
  525       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
  526       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
  527       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
  528       *(*toP)++ = ((lo & 0x3f) | 0x80); \
  529       break; \
  530     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
  531       if (toLim -  *toP < 4) { \
  532     *fromP = from; \
  533     return; \
  534       } \
  535       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
  536       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
  537       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
  538       from += 2; \
  539       lo2 = GET_LO(from); \
  540       *(*toP)++ = (((lo & 0x3) << 4) \
  541                | ((GET_HI(from) & 0x3) << 2) \
  542            | (lo2 >> 6) \
  543            | 0x80); \
  544       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
  545       break; \
  546     } \
  547   } \
  548   *fromP = from; \
  549 }
  550 
  551 #define DEFINE_UTF16_TO_UTF16(E) \
  552 static \
  553 void E ## toUtf16(const ENCODING *enc, \
  554           const char **fromP, const char *fromLim, \
  555           unsigned short **toP, const unsigned short *toLim) \
  556 { \
  557   /* Avoid copying first half only of surrogate */ \
  558   if (fromLim - *fromP > ((toLim - *toP) << 1) \
  559       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
  560     fromLim -= 2; \
  561   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
  562     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  563 }
  564 
  565 #define SET2(ptr, ch) \
  566   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  567 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  568 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  569 
  570 DEFINE_UTF16_TO_UTF8(little2_)
  571 DEFINE_UTF16_TO_UTF16(little2_)
  572 
  573 #undef SET2
  574 #undef GET_LO
  575 #undef GET_HI
  576 
  577 #define SET2(ptr, ch) \
  578   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  579 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  580 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  581 
  582 DEFINE_UTF16_TO_UTF8(big2_)
  583 DEFINE_UTF16_TO_UTF16(big2_)
  584 
  585 #undef SET2
  586 #undef GET_LO
  587 #undef GET_HI
  588 
  589 #define LITTLE2_BYTE_TYPE(enc, p) \
  590  ((p)[1] == 0 \
  591   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  592   : unicode_byte_type((p)[1], (p)[0]))
  593 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  594 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
  595 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  596   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
  597 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  598   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
  599 
  600 #ifdef XML_MIN_SIZE
  601 
  602 static
  603 int little2_byteType(const ENCODING *enc, const char *p)
  604 {
  605   return LITTLE2_BYTE_TYPE(enc, p);
  606 }
  607 
  608 static
  609 int little2_byteToAscii(const ENCODING *enc, const char *p)
  610 {
  611   return LITTLE2_BYTE_TO_ASCII(enc, p);
  612 }
  613 
  614 static
  615 int little2_charMatches(const ENCODING *enc, const char *p, int c)
  616 {
  617   return LITTLE2_CHAR_MATCHES(enc, p, c);
  618 }
  619 
  620 static
  621 int little2_isNameMin(const ENCODING *enc, const char *p)
  622 {
  623   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  624 }
  625 
  626 static
  627 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
  628 {
  629   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  630 }
  631 
  632 #undef VTABLE
  633 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  634 
  635 #else /* not XML_MIN_SIZE */
  636 
  637 #undef PREFIX
  638 #define PREFIX(ident) little2_ ## ident
  639 #define MINBPC(enc) 2
  640 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  641 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  642 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 
  643 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  644 #define IS_NAME_CHAR(enc, p, n) 0
  645 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  646 #define IS_NMSTRT_CHAR(enc, p, n) (0)
  647 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  648 
  649 #include "xmltok_impl.c"
  650 
  651 #undef MINBPC
  652 #undef BYTE_TYPE
  653 #undef BYTE_TO_ASCII
  654 #undef CHAR_MATCHES
  655 #undef IS_NAME_CHAR
  656 #undef IS_NAME_CHAR_MINBPC
  657 #undef IS_NMSTRT_CHAR
  658 #undef IS_NMSTRT_CHAR_MINBPC
  659 #undef IS_INVALID_CHAR
  660 
  661 #endif /* not XML_MIN_SIZE */
  662 
  663 #ifdef XML_NS
  664 
  665 static const struct normal_encoding little2_encoding_ns = { 
  666   { VTABLE, 2, 0,
  667 #if XML_BYTE_ORDER == 12
  668     1
  669 #else
  670     0
  671 #endif
  672   },
  673   {
  674 #include "asciitab.h"
  675 #include "latin1tab.h"
  676   },
  677   STANDARD_VTABLE(little2_)
  678 };
  679 
  680 #endif
  681 
  682 static const struct normal_encoding little2_encoding = { 
  683   { VTABLE, 2, 0,
  684 #if XML_BYTE_ORDER == 12
  685     1
  686 #else
  687     0
  688 #endif
  689   },
  690   {
  691 #define BT_COLON BT_NMSTRT
  692 #include "asciitab.h"
  693 #undef BT_COLON
  694 #include "latin1tab.h"
  695   },
  696   STANDARD_VTABLE(little2_)
  697 };
  698 
  699 #if XML_BYTE_ORDER != 21
  700 
  701 #ifdef XML_NS
  702 
  703 static const struct normal_encoding internal_little2_encoding_ns = { 
  704   { VTABLE, 2, 0, 1 },
  705   {
  706 #include "iasciitab.h"
  707 #include "latin1tab.h"
  708   },
  709   STANDARD_VTABLE(little2_)
  710 };
  711 
  712 #endif
  713 
  714 static const struct normal_encoding internal_little2_encoding = { 
  715   { VTABLE, 2, 0, 1 },
  716   {
  717 #define BT_COLON BT_NMSTRT
  718 #include "iasciitab.h"
  719 #undef BT_COLON
  720 #include "latin1tab.h"
  721   },
  722   STANDARD_VTABLE(little2_)
  723 };
  724 
  725 #endif
  726 
  727 
  728 #define BIG2_BYTE_TYPE(enc, p) \
  729  ((p)[0] == 0 \
  730   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  731   : unicode_byte_type((p)[0], (p)[1]))
  732 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  733 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
  734 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  735   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
  736 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  737   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
  738 
  739 #ifdef XML_MIN_SIZE
  740 
  741 static
  742 int big2_byteType(const ENCODING *enc, const char *p)
  743 {
  744   return BIG2_BYTE_TYPE(enc, p);
  745 }
  746 
  747 static
  748 int big2_byteToAscii(const ENCODING *enc, const char *p)
  749 {
  750   return BIG2_BYTE_TO_ASCII(enc, p);
  751 }
  752 
  753 static
  754 int big2_charMatches(const ENCODING *enc, const char *p, int c)
  755 {
  756   return BIG2_CHAR_MATCHES(enc, p, c);
  757 }
  758 
  759 static
  760 int big2_isNameMin(const ENCODING *enc, const char *p)
  761 {
  762   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  763 }
  764 
  765 static
  766 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
  767 {
  768   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  769 }
  770 
  771 #undef VTABLE
  772 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  773 
  774 #else /* not XML_MIN_SIZE */
  775 
  776 #undef PREFIX
  777 #define PREFIX(ident) big2_ ## ident
  778 #define MINBPC(enc) 2
  779 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  780 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  781 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 
  782 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  783 #define IS_NAME_CHAR(enc, p, n) 0
  784 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  785 #define IS_NMSTRT_CHAR(enc, p, n) (0)
  786 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  787 
  788 #include "xmltok_impl.c"
  789 
  790 #undef MINBPC
  791 #undef BYTE_TYPE
  792 #undef BYTE_TO_ASCII
  793 #undef CHAR_MATCHES
  794 #undef IS_NAME_CHAR
  795 #undef IS_NAME_CHAR_MINBPC
  796 #undef IS_NMSTRT_CHAR
  797 #undef IS_NMSTRT_CHAR_MINBPC
  798 #undef IS_INVALID_CHAR
  799 
  800 #endif /* not XML_MIN_SIZE */
  801 
  802 #ifdef XML_NS
  803 
  804 static const struct normal_encoding big2_encoding_ns = {
  805   { VTABLE, 2, 0,
  806 #if XML_BYTE_ORDER == 21
  807   1
  808 #else
  809   0
  810 #endif
  811   },
  812   {
  813 #include "asciitab.h"
  814 #include "latin1tab.h"
  815   },
  816   STANDARD_VTABLE(big2_)
  817 };
  818 
  819 #endif
  820 
  821 static const struct normal_encoding big2_encoding = {
  822   { VTABLE, 2, 0,
  823 #if XML_BYTE_ORDER == 21
  824   1
  825 #else
  826   0
  827 #endif
  828   },
  829   {
  830 #define BT_COLON BT_NMSTRT
  831 #include "asciitab.h"
  832 #undef BT_COLON
  833 #include "latin1tab.h"
  834   },
  835   STANDARD_VTABLE(big2_)
  836 };
  837 
  838 #if XML_BYTE_ORDER != 12
  839 
  840 #ifdef XML_NS
  841 
  842 static const struct normal_encoding internal_big2_encoding_ns = {
  843   { VTABLE, 2, 0, 1 },
  844   {
  845 #include "iasciitab.h"
  846 #include "latin1tab.h"
  847   },
  848   STANDARD_VTABLE(big2_)
  849 };
  850 
  851 #endif
  852 
  853 static const struct normal_encoding internal_big2_encoding = {
  854   { VTABLE, 2, 0, 1 },
  855   {
  856 #define BT_COLON BT_NMSTRT
  857 #include "iasciitab.h"
  858 #undef BT_COLON
  859 #include "latin1tab.h"
  860   },
  861   STANDARD_VTABLE(big2_)
  862 };
  863 
  864 #endif
  865 
  866 #undef PREFIX
  867 
  868 static
  869 int streqci(const char *s1, const char *s2)
  870 {
  871   for (;;) {
  872     char c1 = *s1++;
  873     char c2 = *s2++;
  874     if ('a' <= c1 && c1 <= 'z')
  875       c1 += 'A' - 'a';
  876     if ('a' <= c2 && c2 <= 'z')
  877       c2 += 'A' - 'a';
  878     if (c1 != c2)
  879       return 0;
  880     if (!c1)
  881       break;
  882   }
  883   return 1;
  884 }
  885 
  886 static
  887 void initUpdatePosition(const ENCODING *enc, const char *ptr,
  888             const char *end, POSITION *pos)
  889 {
  890   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
  891 }
  892 
  893 static
  894 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
  895 {
  896   char buf[1];
  897   char *p = buf;
  898   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  899   if (p == buf)
  900     return -1;
  901   else
  902     return buf[0];
  903 }
  904 
  905 static
  906 int isSpace(int c)
  907 {
  908   switch (c) {
  909   case 0x20:
  910   case 0xD:
  911   case 0xA:
  912   case 0x9: 
  913     return 1;
  914   }
  915   return 0;
  916 }
  917 
  918 /* Return 1 if there's just optional white space
  919 or there's an S followed by name=val. */
  920 static
  921 int parsePseudoAttribute(const ENCODING *enc,
  922              const char *ptr,
  923              const char *end,
  924              const char **namePtr,
  925              const char **nameEndPtr,
  926              const char **valPtr,
  927              const char **nextTokPtr)
  928 {
  929   int c;
  930   char open;
  931   if (ptr == end) {
  932     *namePtr = 0;
  933     return 1;
  934   }
  935   if (!isSpace(toAscii(enc, ptr, end))) {
  936     *nextTokPtr = ptr;
  937     return 0;
  938   }
  939   do {
  940     ptr += enc->minBytesPerChar;
  941   } while (isSpace(toAscii(enc, ptr, end)));
  942   if (ptr == end) {
  943     *namePtr = 0;
  944     return 1;
  945   }
  946   *namePtr = ptr;
  947   for (;;) {
  948     c = toAscii(enc, ptr, end);
  949     if (c == -1) {
  950       *nextTokPtr = ptr;
  951       return 0;
  952     }
  953     if (c == '=') {
  954       *nameEndPtr = ptr;
  955       break;
  956     }
  957     if (isSpace(c)) {
  958       *nameEndPtr = ptr;
  959       do {
  960     ptr += enc->minBytesPerChar;
  961       } while (isSpace(c = toAscii(enc, ptr, end)));
  962       if (c != '=') {
  963     *nextTokPtr = ptr;
  964     return 0;
  965       }
  966       break;
  967     }
  968     ptr += enc->minBytesPerChar;
  969   }
  970   if (ptr == *namePtr) {
  971     *nextTokPtr = ptr;
  972     return 0;
  973   }
  974   ptr += enc->minBytesPerChar;
  975   c = toAscii(enc, ptr, end);
  976   while (isSpace(c)) {
  977     ptr += enc->minBytesPerChar;
  978     c = toAscii(enc, ptr, end);
  979   }
  980   if (c != '"' && c != '\'') {
  981     *nextTokPtr = ptr;
  982     return 0;
  983   }
  984   open = c;
  985   ptr += enc->minBytesPerChar;
  986   *valPtr = ptr;
  987   for (;; ptr += enc->minBytesPerChar) {
  988     c = toAscii(enc, ptr, end);
  989     if (c == open)
  990       break;
  991     if (!('a' <= c && c <= 'z')
  992     && !('A' <= c && c <= 'Z')
  993     && !('0' <= c && c <= '9')
  994     && c != '.'
  995     && c != '-'
  996     && c != '_') {
  997       *nextTokPtr = ptr;
  998       return 0;
  999     }
 1000   }
 1001   *nextTokPtr = ptr + enc->minBytesPerChar;
 1002   return 1;
 1003 }
 1004 
 1005 static
 1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
 1007                                              const char *,
 1008                              const char *),
 1009            int isGeneralTextEntity,
 1010            const ENCODING *enc,
 1011            const char *ptr,
 1012            const char *end,
 1013            const char **badPtr,
 1014            const char **versionPtr,
 1015            const char **encodingName,
 1016            const ENCODING **encoding,
 1017            int *standalone)
 1018 {
 1019   const char *val = 0;
 1020   const char *name = 0;
 1021   const char *nameEnd = 0;
 1022   ptr += 5 * enc->minBytesPerChar;
 1023   end -= 2 * enc->minBytesPerChar;
 1024   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) {
 1025     *badPtr = ptr;
 1026     return 0;
 1027   }
 1028   if (!XmlNameMatchesAscii(enc, name, nameEnd, "version")) {
 1029     if (!isGeneralTextEntity) {
 1030       *badPtr = name;
 1031       return 0;
 1032     }
 1033   }
 1034   else {
 1035     if (versionPtr)
 1036       *versionPtr = val;
 1037     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
 1038       *badPtr = ptr;
 1039       return 0;
 1040     }
 1041     if (!name) {
 1042       if (isGeneralTextEntity) {
 1043     /* a TextDecl must have an EncodingDecl */
 1044     *badPtr = ptr;
 1045     return 0;
 1046       }
 1047       return 1;
 1048     }
 1049   }
 1050   if (XmlNameMatchesAscii(enc, name, nameEnd, "encoding")) {
 1051     int c = toAscii(enc, val, end);
 1052     if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
 1053       *badPtr = val;
 1054       return 0;
 1055     }
 1056     if (encodingName)
 1057       *encodingName = val;
 1058     if (encoding)
 1059       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
 1060     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
 1061       *badPtr = ptr;
 1062       return 0;
 1063     }
 1064     if (!name)
 1065       return 1;
 1066   }
 1067   if (!XmlNameMatchesAscii(enc, name, nameEnd, "standalone") || isGeneralTextEntity) {
 1068     *badPtr = name;
 1069     return 0;
 1070   }
 1071   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, "yes")) {
 1072     if (standalone)
 1073       *standalone = 1;
 1074   }
 1075   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, "no")) {
 1076     if (standalone)
 1077       *standalone = 0;
 1078   }
 1079   else {
 1080     *badPtr = val;
 1081     return 0;
 1082   }
 1083   while (isSpace(toAscii(enc, ptr, end)))
 1084     ptr += enc->minBytesPerChar;
 1085   if (ptr != end) {
 1086     *badPtr = ptr;
 1087     return 0;
 1088   }
 1089   return 1;
 1090 }
 1091 
 1092 static
 1093 int checkCharRefNumber(int result)
 1094 {
 1095   switch (result >> 8) {
 1096   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
 1097   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
 1098     return -1;
 1099   case 0:
 1100     if (latin1_encoding.type[result] == BT_NONXML)
 1101       return -1;
 1102     break;
 1103   case 0xFF:
 1104     if (result == 0xFFFE || result == 0xFFFF)
 1105       return -1;
 1106     break;
 1107   }
 1108   return result;
 1109 }
 1110 
 1111 int XmlUtf8Encode(int c, char *buf)
 1112 {
 1113   enum {
 1114     /* minN is minimum legal resulting value for N byte sequence */
 1115     min2 = 0x80,
 1116     min3 = 0x800,
 1117     min4 = 0x10000
 1118   };
 1119 
 1120   if (c < 0)
 1121     return 0;
 1122   if (c < min2) {
 1123     buf[0] = (c | UTF8_cval1);
 1124     return 1;
 1125   }
 1126   if (c < min3) {
 1127     buf[0] = ((c >> 6) | UTF8_cval2);
 1128     buf[1] = ((c & 0x3f) | 0x80);
 1129     return 2;
 1130   }
 1131   if (c < min4) {
 1132     buf[0] = ((c >> 12) | UTF8_cval3);
 1133     buf[1] = (((c >> 6) & 0x3f) | 0x80);
 1134     buf[2] = ((c & 0x3f) | 0x80);
 1135     return 3;
 1136   }
 1137   if (c < 0x110000) {
 1138     buf[0] = ((c >> 18) | UTF8_cval4);
 1139     buf[1] = (((c >> 12) & 0x3f) | 0x80);
 1140     buf[2] = (((c >> 6) & 0x3f) | 0x80);
 1141     buf[3] = ((c & 0x3f) | 0x80);
 1142     return 4;
 1143   }
 1144   return 0;
 1145 }
 1146 
 1147 int XmlUtf16Encode(int charNum, unsigned short *buf)
 1148 {
 1149   if (charNum < 0)
 1150     return 0;
 1151   if (charNum < 0x10000) {
 1152     buf[0] = charNum;
 1153     return 1;
 1154   }
 1155   if (charNum < 0x110000) {
 1156     charNum -= 0x10000;
 1157     buf[0] = (charNum >> 10) + 0xD800;
 1158     buf[1] = (charNum & 0x3FF) + 0xDC00;
 1159     return 2;
 1160   }
 1161   return 0;
 1162 }
 1163 
 1164 struct unknown_encoding {
 1165   struct normal_encoding normal;
 1166   int (*convert)(void *userData, const char *p);
 1167   void *userData;
 1168   unsigned short utf16[256];
 1169   char utf8[256][4];
 1170 };
 1171 
 1172 int XmlSizeOfUnknownEncoding(void)
 1173 {
 1174   return sizeof(struct unknown_encoding);
 1175 }
 1176 
 1177 static
 1178 int unknown_isName(const ENCODING *enc, const char *p)
 1179 {
 1180   int c = ((const struct unknown_encoding *)enc)
 1181       ->convert(((const struct unknown_encoding *)enc)->userData, p);
 1182   if (c & ~0xFFFF)
 1183     return 0;
 1184   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
 1185 }
 1186 
 1187 static
 1188 int unknown_isNmstrt(const ENCODING *enc, const char *p)
 1189 {
 1190   int c = ((const struct unknown_encoding *)enc)
 1191       ->convert(((const struct unknown_encoding *)enc)->userData, p);
 1192   if (c & ~0xFFFF)
 1193     return 0;
 1194   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
 1195 }
 1196 
 1197 static
 1198 int unknown_isInvalid(const ENCODING *enc, const char *p)
 1199 {
 1200   int c = ((const struct unknown_encoding *)enc)
 1201        ->convert(((const struct unknown_encoding *)enc)->userData, p);
 1202   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
 1203 }
 1204 
 1205 static
 1206 void unknown_toUtf8(const ENCODING *enc,
 1207             const char **fromP, const char *fromLim,
 1208             char **toP, const char *toLim)
 1209 {
 1210   char buf[XML_UTF8_ENCODE_MAX];
 1211   for (;;) {
 1212     const char *utf8;
 1213     int n;
 1214     if (*fromP == fromLim)
 1215       break;
 1216     utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
 1217     n = *utf8++;
 1218     if (n == 0) {
 1219       int c = ((const struct unknown_encoding *)enc)
 1220           ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
 1221       n = XmlUtf8Encode(c, buf);
 1222       if (n > toLim - *toP)
 1223     break;
 1224       utf8 = buf;
 1225       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
 1226              - (BT_LEAD2 - 2);
 1227     }
 1228     else {
 1229       if (n > toLim - *toP)
 1230     break;
 1231       (*fromP)++;
 1232     }
 1233     do {
 1234       *(*toP)++ = *utf8++;
 1235     } while (--n != 0);
 1236   }
 1237 }
 1238 
 1239 static
 1240 void unknown_toUtf16(const ENCODING *enc,
 1241              const char **fromP, const char *fromLim,
 1242              unsigned short **toP, const unsigned short *toLim)
 1243 {
 1244   while (*fromP != fromLim && *toP != toLim) {
 1245     unsigned short c
 1246       = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
 1247     if (c == 0) {
 1248       c = (unsigned short)((const struct unknown_encoding *)enc)
 1249        ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
 1250       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
 1251              - (BT_LEAD2 - 2);
 1252     }
 1253     else
 1254       (*fromP)++;
 1255     *(*toP)++ = c;
 1256   }
 1257 }
 1258 
 1259 ENCODING *
 1260 XmlInitUnknownEncoding(void *mem,
 1261                int *table,
 1262                int (*convert)(void *userData, const char *p),
 1263                void *userData)
 1264 {
 1265   int i;
 1266   struct unknown_encoding *e = mem;
 1267   for (i = 0; i < sizeof(struct normal_encoding); i++)
 1268     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
 1269   for (i = 0; i < 128; i++)
 1270     if (latin1_encoding.type[i] != BT_OTHER
 1271         && latin1_encoding.type[i] != BT_NONXML
 1272     && table[i] != i)
 1273       return 0;
 1274   for (i = 0; i < 256; i++) {
 1275     int c = table[i];
 1276     if (c == -1) {
 1277       e->normal.type[i] = BT_MALFORM;
 1278       /* This shouldn't really get used. */
 1279       e->utf16[i] = 0xFFFF;
 1280       e->utf8[i][0] = 1;
 1281       e->utf8[i][1] = 0;
 1282     }
 1283     else if (c < 0) {
 1284       if (c < -4)
 1285     return 0;
 1286       e->normal.type[i] = BT_LEAD2 - (c + 2);
 1287       e->utf8[i][0] = 0;
 1288       e->utf16[i] = 0;
 1289     }
 1290     else if (c < 0x80) {
 1291       if (latin1_encoding.type[c] != BT_OTHER
 1292       && latin1_encoding.type[c] != BT_NONXML
 1293       && c != i)
 1294     return 0;
 1295       e->normal.type[i] = latin1_encoding.type[c];
 1296       e->utf8[i][0] = 1;
 1297       e->utf8[i][1] = (char)c;
 1298       e->utf16[i] = c == 0 ? 0xFFFF : c;
 1299     }
 1300     else if (checkCharRefNumber(c) < 0) {
 1301       e->normal.type[i] = BT_NONXML;
 1302       /* This shouldn't really get used. */
 1303       e->utf16[i] = 0xFFFF;
 1304       e->utf8[i][0] = 1;
 1305       e->utf8[i][1] = 0;
 1306     }
 1307     else {
 1308       if (c > 0xFFFF)
 1309     return 0;
 1310       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
 1311     e->normal.type[i] = BT_NMSTRT;
 1312       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
 1313     e->normal.type[i] = BT_NAME;
 1314       else
 1315     e->normal.type[i] = BT_OTHER;
 1316       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
 1317       e->utf16[i] = c;
 1318     }
 1319   }
 1320   e->userData = userData;
 1321   e->convert = convert;
 1322   if (convert) {
 1323     e->normal.isName2 = unknown_isName;
 1324     e->normal.isName3 = unknown_isName;
 1325     e->normal.isName4 = unknown_isName;
 1326     e->normal.isNmstrt2 = unknown_isNmstrt;
 1327     e->normal.isNmstrt3 = unknown_isNmstrt;
 1328     e->normal.isNmstrt4 = unknown_isNmstrt;
 1329     e->normal.isInvalid2 = unknown_isInvalid;
 1330     e->normal.isInvalid3 = unknown_isInvalid;
 1331     e->normal.isInvalid4 = unknown_isInvalid;
 1332   }
 1333   e->normal.enc.utf8Convert = unknown_toUtf8;
 1334   e->normal.enc.utf16Convert = unknown_toUtf16;
 1335   return &(e->normal.enc);
 1336 }
 1337 
 1338 /* If this enumeration is changed, getEncodingIndex and encodings
 1339 must also be changed. */
 1340 enum {
 1341   UNKNOWN_ENC = -1,
 1342   ISO_8859_1_ENC = 0,
 1343   US_ASCII_ENC,
 1344   UTF_8_ENC,
 1345   UTF_16_ENC,
 1346   UTF_16BE_ENC,
 1347   UTF_16LE_ENC,
 1348   /* must match encodingNames up to here */
 1349   NO_ENC
 1350 };
 1351 
 1352 static
 1353 int getEncodingIndex(const char *name)
 1354 {
 1355   static const char *encodingNames[] = {
 1356     "ISO-8859-1",
 1357     "US-ASCII",
 1358     "UTF-8",
 1359     "UTF-16",
 1360     "UTF-16BE"
 1361     "UTF-16LE",
 1362   };
 1363   int i;
 1364   if (name == 0)
 1365     return NO_ENC;
 1366   for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
 1367     if (streqci(name, encodingNames[i]))
 1368       return i;
 1369   return UNKNOWN_ENC;
 1370 }
 1371 
 1372 /* For binary compatibility, we store the index of the encoding specified
 1373 at initialization in the isUtf16 member. */
 1374 
 1375 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
 1376 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
 1377 
 1378 /* This is what detects the encoding.
 1379 encodingTable maps from encoding indices to encodings;
 1380 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
 1381 state is XML_CONTENT_STATE if we're parsing an external text entity,
 1382 and XML_PROLOG_STATE otherwise.
 1383 */
 1384 
 1385 
 1386 static
 1387 int initScan(const ENCODING **encodingTable,
 1388          const INIT_ENCODING *enc,
 1389          int state,
 1390          const char *ptr,
 1391          const char *end,
 1392          const char **nextTokPtr)
 1393 {
 1394   const ENCODING **encPtr;
 1395 
 1396   if (ptr == end)
 1397     return XML_TOK_NONE;
 1398   encPtr = enc->encPtr;
 1399   if (ptr + 1 == end) {
 1400     /* only a single byte available for auto-detection */
 1401 #ifndef XML_DTD /* FIXME */
 1402     /* a well-formed document entity must have more than one byte */
 1403     if (state != XML_CONTENT_STATE)
 1404       return XML_TOK_PARTIAL;
 1405 #endif
 1406     /* so we're parsing an external text entity... */
 1407     /* if UTF-16 was externally specified, then we need at least 2 bytes */
 1408     switch (INIT_ENC_INDEX(enc)) {
 1409     case UTF_16_ENC:
 1410     case UTF_16LE_ENC:
 1411     case UTF_16BE_ENC:
 1412       return XML_TOK_PARTIAL;
 1413     }
 1414     switch ((unsigned char)*ptr) {
 1415     case 0xFE:
 1416     case 0xFF:
 1417     case 0xEF: /* possibly first byte of UTF-8 BOM */
 1418       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
 1419       && state == XML_CONTENT_STATE)
 1420     break;
 1421       /* fall through */
 1422     case 0x00:
 1423     case 0x3C:
 1424       return XML_TOK_PARTIAL;
 1425     }
 1426   }
 1427   else {
 1428     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
 1429     case 0xFEFF:
 1430       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
 1431       && state == XML_CONTENT_STATE)
 1432     break;
 1433       *nextTokPtr = ptr + 2;
 1434       *encPtr = encodingTable[UTF_16BE_ENC];
 1435       return XML_TOK_BOM;
 1436     /* 00 3C is handled in the default case */
 1437     case 0x3C00:
 1438       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
 1439        || INIT_ENC_INDEX(enc) == UTF_16_ENC)
 1440       && state == XML_CONTENT_STATE)
 1441     break;
 1442       *encPtr = encodingTable[UTF_16LE_ENC];
 1443       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
 1444     case 0xFFFE:
 1445       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
 1446       && state == XML_CONTENT_STATE)
 1447     break;
 1448       *nextTokPtr = ptr + 2;
 1449       *encPtr = encodingTable[UTF_16LE_ENC];
 1450       return XML_TOK_BOM;
 1451     case 0xEFBB:
 1452       /* Maybe a UTF-8 BOM (EF BB BF) */
 1453       /* If there's an explicitly specified (external) encoding
 1454          of ISO-8859-1 or some flavour of UTF-16
 1455          and this is an external text entity,
 1456      don't look for the BOM,
 1457          because it might be a legal data. */
 1458       if (state == XML_CONTENT_STATE) {
 1459     int e = INIT_ENC_INDEX(enc);
 1460     if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
 1461       break;
 1462       }
 1463       if (ptr + 2 == end)
 1464     return XML_TOK_PARTIAL;
 1465       if ((unsigned char)ptr[2] == 0xBF) {
 1466     *encPtr = encodingTable[UTF_8_ENC];
 1467     return XML_TOK_BOM;
 1468       }
 1469       break;
 1470     default:
 1471       if (ptr[0] == '\0') {
 1472     /* 0 isn't a legal data character. Furthermore a document entity can only
 1473        start with ASCII characters.  So the only way this can fail to be big-endian
 1474        UTF-16 if it it's an external parsed general entity that's labelled as
 1475        UTF-16LE. */
 1476     if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
 1477       break;
 1478     *encPtr = encodingTable[UTF_16BE_ENC];
 1479     return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
 1480       }
 1481       else if (ptr[1] == '\0') {
 1482     /* We could recover here in the case:
 1483         - parsing an external entity
 1484         - second byte is 0
 1485         - no externally specified encoding
 1486         - no encoding declaration
 1487        by assuming UTF-16LE.  But we don't, because this would mean when
 1488        presented just with a single byte, we couldn't reliably determine
 1489        whether we needed further bytes. */
 1490     if (state == XML_CONTENT_STATE)
 1491       break;
 1492     *encPtr = encodingTable[UTF_16LE_ENC];
 1493     return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
 1494       }
 1495       break;
 1496     }
 1497   }
 1498   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
 1499   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
 1500 }
 1501 
 1502 
 1503 #define NS(x) x
 1504 #define ns(x) x
 1505 #include "xmltok_ns.c"
 1506 #undef NS
 1507 #undef ns
 1508 
 1509 #ifdef XML_NS
 1510 
 1511 #define NS(x) x ## NS
 1512 #define ns(x) x ## _ns
 1513 
 1514 #include "xmltok_ns.c"
 1515 
 1516 #undef NS
 1517 #undef ns
 1518 
 1519 ENCODING *
 1520 XmlInitUnknownEncodingNS(void *mem,
 1521                  int *table,
 1522                  int (*convert)(void *userData, const char *p),
 1523                  void *userData)
 1524 {
 1525   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
 1526   if (enc)
 1527     ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
 1528   return enc;
 1529 }
 1530 
 1531 #endif /* XML_NS */