"Fossies" - the Fresh Open Source Software Archive

Member "links-1.04/charsets.c" (22 Feb 2015, 12521 Bytes) of package /linux/www/links-1.04.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charsets.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.03_vs_1.04.

    1 #include "links.h"
    2 
    3 struct table_entry {
    4     unsigned char c;
    5     int u;
    6 };
    7 
    8 struct codepage_desc {
    9     unsigned char *name;
   10     unsigned char **aliases;
   11     struct table_entry *table;
   12 };
   13 
   14 #include "codepage.inc"
   15 #include "uni_7b.inc"
   16 #include "entity.inc"
   17 #include "upcase.inc"
   18 
   19 unsigned char strings[256][2] = {
   20     "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
   21     "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
   22     "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
   23     "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
   24     "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
   25     "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
   26     "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
   27     "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
   28     "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
   29     "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
   30     "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
   31     "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
   32     "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
   33     "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
   34     "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
   35     "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
   36     "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
   37     "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
   38     "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
   39     "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
   40     "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
   41     "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
   42     "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
   43     "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
   44     "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
   45     "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
   46     "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
   47     "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
   48     "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
   49     "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
   50     "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
   51     "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
   52 };
   53 
   54 void free_translation_table(struct conv_table *p)
   55 {
   56     int i;
   57     for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
   58     mem_free(p);
   59 }
   60 
   61 unsigned char no_str[] = "*";
   62 
   63 void new_translation_table(struct conv_table *p)
   64 {
   65     int i;
   66     for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
   67     for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = strings[i];
   68     for (; i < 256; i++) p[i].t = 0, p[i].u.str = no_str;
   69 }
   70 
   71 #define BIN_SEARCH(table, entry, entries, key, result)          \
   72 {                                   \
   73     int s_ = 0, e_ = (entries) - 1;                 \
   74     while (s_ <= e_ || !((result) = -1)) {              \
   75         int m_ = ((unsigned)s_ + (unsigned)e_) / 2;     \
   76         if ((table)[m_].entry == (key)) {           \
   77             (result) = m_;                  \
   78             break;                      \
   79         }                           \
   80         if ((table)[m_].entry > (key)) e_ = m_ - 1;     \
   81         if ((table)[m_].entry < (key)) s_ = m_ + 1;     \
   82     }                               \
   83 }                                   \
   84 
   85 int strange_chars[32] = {
   86 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
   87 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
   88 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
   89 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
   90 };
   91 
   92 static inline unsigned char *u2cp(int u, int to, int fallback)
   93 {
   94     int j, s;
   95     again:
   96     if (u < 128) return strings[u];
   97     if (u == 0xa0) return strings[1];
   98     if (u == 0xad) return strings[0];
   99     if (u < 0xa0) {
  100         u = strange_chars[u - 0x80];
  101         if (!u) return NULL;
  102         goto again;
  103     }
  104     for (j = 0; codepages[to].table[j].c; j++)
  105         if (codepages[to].table[j].u == u)
  106             return strings[codepages[to].table[j].c];
  107     if (!fallback) return NULL;
  108     BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
  109     if (s != -1) return unicode_7b[s].s;
  110     return NULL;
  111 }
  112 
  113 int cp2u(unsigned char ch, int from)
  114 {
  115     struct table_entry *e;
  116     if (from < 0 || ch < 0x80) return ch;
  117     for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
  118     return -1;
  119 }
  120 
  121 unsigned char utf_buffer[7];
  122 
  123 unsigned char *encode_utf_8(int u)
  124 {
  125     memset(utf_buffer, 0, 7);
  126     if (u < 0x80) utf_buffer[0] = u;
  127     else if (u < 0x800)
  128         utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
  129         utf_buffer[1] = 0x80 | (u & 0x3f);
  130     else if (u < 0x10000)
  131         utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
  132         utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
  133         utf_buffer[2] = 0x80 | (u & 0x3f);
  134     else if (u < 0x200000)
  135         utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
  136         utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
  137         utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
  138         utf_buffer[3] = 0x80 | (u & 0x3f);
  139     else if (u < 0x4000000)
  140         utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
  141         utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
  142         utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
  143         utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
  144         utf_buffer[4] = 0x80 | (u & 0x3f);
  145     else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
  146         utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
  147         utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
  148         utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
  149         utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
  150         utf_buffer[5] = 0x80 | (u & 0x3f);
  151     return utf_buffer;
  152 }
  153 
  154 void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
  155 {
  156     unsigned char *p = encode_utf_8(u);
  157     while (p[1]) {
  158         if (ct[*p].t) ct = ct[*p].u.tbl;
  159         else {
  160             struct conv_table *nct;
  161             if (ct[*p].u.str != no_str) {
  162                 internal("bad utf encoding #1");
  163                 return;
  164             }
  165             nct = mem_alloc(sizeof(struct conv_table) * 256);
  166             memset(nct, 0, sizeof(struct conv_table) * 256);
  167             new_translation_table(nct);
  168             ct[*p].t = 1;
  169             ct[*p].u.tbl = nct;
  170             ct = nct;
  171         }
  172         p++;
  173     }
  174     if (ct[*p].t) {
  175         internal("bad utf encoding #2");
  176         return;
  177     }
  178     if (ct[*p].u.str == no_str) ct[*p].u.str = str;
  179 }
  180 
  181 struct conv_table utf_table[256];
  182 int utf_table_init = 1;
  183 
  184 void free_utf_table()
  185 {
  186     int i;
  187     for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
  188 }
  189 
  190 struct conv_table *get_translation_table_to_utf_8(int from)
  191 {
  192     int i;
  193     static int lfr = -1;
  194     if (from == -1) return NULL;
  195     if (from == lfr) return utf_table;
  196     lfr = from;
  197     if (utf_table_init) memset(utf_table, 0, sizeof(struct conv_table) * 256), utf_table_init = 0;
  198     else free_utf_table();
  199     for (i = 0; i < 128; i++) utf_table[i].u.str = strings[i];
  200     if (codepages[from].table == table_utf_8) {
  201         for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
  202         return utf_table;
  203     }
  204     for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
  205     for (i = 0; codepages[from].table[i].c; i++) {
  206         int u = codepages[from].table[i].u;
  207         if (!utf_table[codepages[from].table[i].c].u.str)
  208             utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
  209     }
  210     for (i = 128; i < 256; i++)
  211         if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
  212     return utf_table;
  213 }
  214 
  215 struct conv_table table[256];
  216 static int first = 1;
  217 
  218 void free_conv_table()
  219 {
  220     if (!utf_table_init) free_utf_table();
  221     if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
  222     new_translation_table(table);
  223 }
  224 
  225 struct conv_table *get_translation_table(int from, int to)
  226 {
  227     int i;
  228     static int lfr = -1;
  229     static int lto = -1;
  230     if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
  231     if (/*from == to ||*/ from == -1 || to == -1) return NULL;
  232     if (codepages[to].table == table_utf_8) return get_translation_table_to_utf_8(from);
  233     if (from == lfr && to == lto) return table;
  234     lfr = from; lto = to;
  235     new_translation_table(table);
  236     if (codepages[from].table == table_utf_8) {
  237         int j;
  238         for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, codepages[to].table[j].u == 0xa0 ? strings[1] : codepages[to].table[j].u == 0xad ? strings[0] : strings[codepages[to].table[j].c]);
  239         for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, unicode_7b[i].s);
  240     } else for (i = 128; i < 256; i++) {
  241         int j;
  242         char *u;
  243         for (j = 0; codepages[from].table[j].c; j++) {
  244             if (codepages[from].table[j].c == i) goto f;
  245         }
  246         continue;
  247         f:
  248         u = u2cp(codepages[from].table[j].u, to, 1);
  249         if (u) table[i].u.str = u;
  250     }
  251     return table;
  252 }
  253 
  254 static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
  255 {
  256     while (l2) {
  257         if (*s1 > *s2) return 1;
  258         if (!*s1 || *s1 < *s2) return -1;
  259         s1++, s2++, l2--;
  260     }
  261     return !!*s1;
  262 }
  263 
  264 int get_entity_number(unsigned char *st, int l)
  265 {
  266     int n = 0;
  267     if (upcase(st[0]) == 'X') {
  268         st++, l--;
  269         if (!l) return -1;
  270         do {
  271             char c = upcase(*(st++));
  272             if (c >= '0' && c <= '9') n = n * 16 + c - '0';
  273             else if (c >= 'A' && c <= 'F') n = n * 16 + c - 'A' + 10;
  274             else return -1;
  275             if (n >= 0x10000) return -1;
  276         } while (--l);
  277     } else {
  278         if (!l) return -1;
  279         do {
  280             char c = *(st++);
  281             if (c >= '0' && c <= '9') n = n * 10 + c - '0';
  282             else return -1;
  283             if (n >= 0x10000) return -1;
  284         } while (--l);
  285     }
  286     return n;
  287 }
  288 
  289 unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
  290 {
  291     int n;
  292     if (l <= 0) return NULL;
  293     if (st[0] == '#') {
  294         if (l == 1) return NULL;
  295         if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
  296         if (n < 32 && get_attr_val_nl != 2) n = 32;
  297     } else {
  298         int s = 0, e = N_ENTITIES - 1;
  299         while (s <= e) {
  300             int c;
  301             int m = (s + e) / 2;
  302             c = xxstrcmp(entities[m].s, st, l);
  303             if (!c) {
  304                 n = entities[m].c;
  305                 goto f;
  306             }
  307             if (c > 0) e = m - 1;
  308             else s = m + 1;
  309         }
  310         return NULL;
  311         f:;
  312     }
  313 
  314     return u2cp(n, encoding, 1);
  315 }
  316 
  317 unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l)
  318 {
  319     unsigned char *buffer;
  320     int bp = 0;
  321     int pp = 0;
  322     if (!ct) {
  323         int i;
  324         for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
  325         return memacpy(c, l);
  326         xx:;
  327     }
  328     buffer = mem_alloc(ALLOC_GR);
  329     while (pp < l) {
  330         unsigned char *e;
  331         if (c[pp] < 128 && c[pp] != '&') {
  332             put_c:
  333             buffer[bp++] = c[pp++];
  334             if (!(bp & (ALLOC_GR - 1))) {
  335                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  336                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  337             }
  338             continue;
  339         }
  340         if (c[pp] != '&') {
  341             struct conv_table *t;
  342             int i;
  343             if (!ct) goto put_c;
  344             t = ct;
  345             i = pp;
  346             decode:
  347             if (!t[c[i]].t) {
  348                 e = t[c[i]].u.str;
  349             } else {
  350                 t = t[c[i++]].u.tbl;
  351                 if (i >= l) goto put_c;
  352                 goto decode;
  353             }
  354             pp = i + 1;
  355         } else {
  356             int i = pp + 1;
  357             if (d_opt->plain) goto put_c;
  358             while (i < l && c[i] != ';' && c[i] != '&' && c[i] > ' ') i++;
  359             if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, d_opt->cp))) goto put_c;
  360             pp = i + (i < l && c[i] == ';');
  361         }
  362         if (!e[0]) continue;
  363         if (!e[1]) {
  364             buffer[bp++] = e[0];
  365             if (!(bp & (ALLOC_GR - 1))) {
  366                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  367                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  368             }
  369             continue;
  370         }
  371         while (*e) {
  372             buffer[bp++] = *(e++);
  373             if (!(bp & (ALLOC_GR - 1))) {
  374                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  375                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  376             }
  377         }
  378     }
  379     buffer[bp] = 0;
  380     return buffer;
  381 }
  382 
  383 int get_cp_index(unsigned char *n)
  384 {
  385     int i, a, p, q;
  386     int ii = -1, ll = 0;
  387     for (i = 0; codepages[i].name; i++) {
  388         for (a = 0; codepages[i].aliases[a]; a++) {
  389             for (p = 0; n[p]; p++) {
  390                 if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
  391                     for (q = 1; codepages[i].aliases[a][q]; q++) {
  392                         if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
  393                     }
  394                     if (strlen(codepages[i].aliases[a]) > (size_t)ll) {
  395                         ll = strlen(codepages[i].aliases[a]);
  396                         ii = i;
  397                     }
  398                 }
  399                 fail:;
  400             }
  401         }
  402     }
  403     return ii;
  404 }
  405 
  406 unsigned char *get_cp_name(int index)
  407 {
  408     if (index < 0) return "none";
  409     return codepages[index].name;
  410 }
  411 
  412 unsigned char *get_cp_mime_name(int index)
  413 {
  414     if (index < 0) return "none";
  415     if (!codepages[index].aliases) return NULL;
  416     return codepages[index].aliases[0];
  417 }
  418 
  419 int is_cp_special(int index)
  420 {
  421     return codepages[index].table == table_utf_8;
  422 }
  423 
  424 unsigned char charset_upcase(unsigned char ch, int cp)
  425 {
  426     int u, res;
  427     unsigned char *str;
  428     if (ch < 0x80) return upcase(ch);
  429     u = cp2u(ch, cp);
  430     BIN_SEARCH(unicode_upcase, lo, sizeof(unicode_upcase) / sizeof(*unicode_upcase), u, res);
  431     if (res == -1) return ch;
  432     str = u2cp(unicode_upcase[res].up, cp, 0);
  433     if (!str || !str[0] || str[1]) return ch;
  434     return str[0];
  435 }
  436 
  437 void charset_upcase_string(unsigned char **chp, int cp)
  438 {
  439     unsigned char *ch = *chp;
  440     int i;
  441     for (i = 0; ch[i]; i++) ch[i] = charset_upcase(ch[i], cp);
  442 }
  443