"Fossies" - the Fresh Open Source Software Archive

Member "links-1.03/charsets.c" (16 Nov 2011, 12541 Bytes) of archive /linux/www/links-1.03.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charsets.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.8_vs_1.03.

    1 #include "links.h"
    2 
    /*
     * One row of a codepage mapping table: a byte of the 8-bit character set
     * paired with the Unicode code point it represents.
     *
     * The loops in this file walk a table until they reach a row whose `c`
     * is 0, so a zero byte serves as the end-of-table marker.
     */
    struct table_entry {
        unsigned char c;    /* byte value in the codepage; 0 ends the table */
        int u;              /* Unicode code point assigned to that byte */
    };
    7 
    /*
     * Description of one supported character set.
     *
     * The global codepage list is scanned until an entry with a NULL `name`
     * is found, and each alias list is scanned until a NULL pointer.
     */
    struct codepage_desc {
        unsigned char *name;        /* name returned by get_cp_name() */
        unsigned char **aliases;    /* NULL-terminated alias list; aliases[0]
                                       is what get_cp_mime_name() returns */
        struct table_entry *table;  /* byte -> Unicode rows, ended by c == 0 */
    };
   13 
   14 #include "codepage.inc"
   15 #include "uni_7b.inc"
   16 #include "entity.inc"
   17 #include "upcase.inc"
   18 
   /*
    * One-character, NUL-terminated strings indexed by byte value.  Code that
    * needs a string for a single byte points into this table instead of
    * allocating.
    *
    * NOTE(review): slots 0x17 and 0x1f hold ESC (0x1b) rather than their own
    * value.  This is preserved exactly as found; confirm whether it is
    * intentional before "correcting" it.
    */
   char strings[256][2] = {
       /* 0x00 */ "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
       /* 0x08 */ "\x08", "\x09", "\x0a", "\x0b", "\x0c", "\x0d", "\x0e", "\x0f",
       /* 0x10 */ "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x1b",
       /* 0x18 */ "\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1b",
       /* 0x20 */ "\x20", "\x21", "\x22", "\x23", "\x24", "\x25", "\x26", "\x27",
       /* 0x28 */ "\x28", "\x29", "\x2a", "\x2b", "\x2c", "\x2d", "\x2e", "\x2f",
       /* 0x30 */ "\x30", "\x31", "\x32", "\x33", "\x34", "\x35", "\x36", "\x37",
       /* 0x38 */ "\x38", "\x39", "\x3a", "\x3b", "\x3c", "\x3d", "\x3e", "\x3f",
       /* 0x40 */ "\x40", "\x41", "\x42", "\x43", "\x44", "\x45", "\x46", "\x47",
       /* 0x48 */ "\x48", "\x49", "\x4a", "\x4b", "\x4c", "\x4d", "\x4e", "\x4f",
       /* 0x50 */ "\x50", "\x51", "\x52", "\x53", "\x54", "\x55", "\x56", "\x57",
       /* 0x58 */ "\x58", "\x59", "\x5a", "\x5b", "\x5c", "\x5d", "\x5e", "\x5f",
       /* 0x60 */ "\x60", "\x61", "\x62", "\x63", "\x64", "\x65", "\x66", "\x67",
       /* 0x68 */ "\x68", "\x69", "\x6a", "\x6b", "\x6c", "\x6d", "\x6e", "\x6f",
       /* 0x70 */ "\x70", "\x71", "\x72", "\x73", "\x74", "\x75", "\x76", "\x77",
       /* 0x78 */ "\x78", "\x79", "\x7a", "\x7b", "\x7c", "\x7d", "\x7e", "\x7f",
       /* 0x80 */ "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87",
       /* 0x88 */ "\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f",
       /* 0x90 */ "\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97",
       /* 0x98 */ "\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f",
       /* 0xa0 */ "\xa0", "\xa1", "\xa2", "\xa3", "\xa4", "\xa5", "\xa6", "\xa7",
       /* 0xa8 */ "\xa8", "\xa9", "\xaa", "\xab", "\xac", "\xad", "\xae", "\xaf",
       /* 0xb0 */ "\xb0", "\xb1", "\xb2", "\xb3", "\xb4", "\xb5", "\xb6", "\xb7",
       /* 0xb8 */ "\xb8", "\xb9", "\xba", "\xbb", "\xbc", "\xbd", "\xbe", "\xbf",
       /* 0xc0 */ "\xc0", "\xc1", "\xc2", "\xc3", "\xc4", "\xc5", "\xc6", "\xc7",
       /* 0xc8 */ "\xc8", "\xc9", "\xca", "\xcb", "\xcc", "\xcd", "\xce", "\xcf",
       /* 0xd0 */ "\xd0", "\xd1", "\xd2", "\xd3", "\xd4", "\xd5", "\xd6", "\xd7",
       /* 0xd8 */ "\xd8", "\xd9", "\xda", "\xdb", "\xdc", "\xdd", "\xde", "\xdf",
       /* 0xe0 */ "\xe0", "\xe1", "\xe2", "\xe3", "\xe4", "\xe5", "\xe6", "\xe7",
       /* 0xe8 */ "\xe8", "\xe9", "\xea", "\xeb", "\xec", "\xed", "\xee", "\xef",
       /* 0xf0 */ "\xf0", "\xf1", "\xf2", "\xf3", "\xf4", "\xf5", "\xf6", "\xf7",
       /* 0xf8 */ "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
   };
   53 
   54 void free_translation_table(struct conv_table *p)
   55 {
   56     int i;
   57     for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
   58     mem_free(p);
   59 }
   60 
   61 unsigned char *no_str = NULL;
   62 
   63 void new_translation_table(struct conv_table *p)
   64 {
   65     int i;
   66     for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
   67     for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = strings[i];
   68     for (; i < 256; i++) p[i].t = 0, p[i].u.str = no_str;
   69 }
   70 
   /*
    * BIN_SEARCH(table, entry, entries, key, result)
    *
    * Binary-search `table`, an array of structs sorted in ascending order of
    * the member `entry`, for an element whose `entry` equals `key`.
    *
    *   table   - array to search
    *   entry   - name of the struct member that holds the sort key
    *   entries - number of elements in `table`
    *   key     - value to look for (evaluated more than once)
    *   result  - int lvalue; receives the matching index, or -1 if absent
    *
    * The body is wrapped in do { } while (0) so that `BIN_SEARCH(...);` is a
    * single statement and stays correct inside an unbraced if/else.  The
    * midpoint is computed as lo + (hi - lo) / 2 so it cannot overflow, and the
    * last line carries no continuation backslash, so the line following the
    * macro is never spliced into it.
    */
   #define BIN_SEARCH(table, entry, entries, key, result)          \
   do {                                                            \
       int bs_lo_ = 0, bs_hi_ = (int)(entries) - 1;                \
       int bs_hit_ = -1;                                           \
       while (bs_lo_ <= bs_hi_) {                                  \
           int bs_mid_ = bs_lo_ + (bs_hi_ - bs_lo_) / 2;           \
           if ((table)[bs_mid_].entry == (key)) {                  \
               bs_hit_ = bs_mid_;                                  \
               break;                                              \
           }                                                       \
           if ((table)[bs_mid_].entry > (key))                     \
               bs_hi_ = bs_mid_ - 1;                               \
           else                                                    \
               bs_lo_ = bs_mid_ + 1;                               \
       }                                                           \
       (result) = bs_hit_;                                         \
   } while (0)
   84 
   /*
    * Replacement code points for the range 0x80..0x9f, indexed by (u - 0x80)
    * in u2cp().  A value of 0 means "no replacement" (u2cp() then returns
    * NULL); any other value is the code point u2cp() converts instead.
    *
    * NOTE(review): the non-zero values look like the Windows-1252 assignments
    * for these bytes, with some punctuation folded to ASCII -- confirm.
    */
   int strange_chars[32] = {
       /* 0x80 */ 0x20ac, 0x0000, 0x002a, 0x0000,
       /* 0x84 */ 0x201e, 0x2026, 0x2020, 0x2021,
       /* 0x88 */ 0x005e, 0x2030, 0x0160, 0x003c,
       /* 0x8c */ 0x0152, 0x0000, 0x0000, 0x0000,
       /* 0x90 */ 0x0000, 0x0060, 0x0027, 0x0022,
       /* 0x94 */ 0x0022, 0x002a, 0x2013, 0x2014,
       /* 0x98 */ 0x007e, 0x2122, 0x0161, 0x003e,
       /* 0x9c */ 0x0153, 0x0000, 0x0000, 0x0000,
   };
   91 
   92 static inline unsigned char *u2cp(int u, int to, int fallback)
   93 {
   94     int j, s;
   95     again:
   96     if (u < 128) return strings[u];
   97     if (u == 0xa0) return "\001";
   98     if (u == 0xad) return "";
   99     if (u < 0xa0) {
  100         u = strange_chars[u - 0x80];
  101         if (!u) return NULL;
  102         goto again;
  103     }
  104     for (j = 0; codepages[to].table[j].c; j++)
  105         if (codepages[to].table[j].u == u)
  106             return strings[codepages[to].table[j].c];
  107     if (!fallback) return NULL;
  108     BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
  109     if (s != -1) return unicode_7b[s].s;
  110     return NULL;
  111 }
  112 
  113 int cp2u(unsigned char ch, int from)
  114 {
  115     struct table_entry *e;
  116     if (from < 0 || ch < 0x80) return ch;
  117     for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
  118     return -1;
  119 }
  120 
  /* Shared result buffer of encode_utf_8(): up to 6 bytes plus a NUL. */
  unsigned char utf_buffer[7];

  /*
   * Encode `u` as a NUL-terminated UTF-8 byte sequence.
   *
   * Values below 0x80 (including negative ones, which are simply truncated
   * to one byte) are stored as a single byte.  Larger values use 2 to 6
   * bytes, covering the full positive int range with the 5- and 6-byte forms.
   *
   * Returns `utf_buffer`, a static buffer that is overwritten by the next
   * call; callers that need to keep the result must copy it.
   */
  unsigned char *encode_utf_8(int u)
  {
      /* Lead-byte marker and payload mask, indexed by (continuation bytes - 1). */
      static const unsigned char lead_bits[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
      static const unsigned char lead_mask[5] = { 0x1f, 0x0f, 0x07, 0x03, 0x01 };
      int extra;  /* number of continuation bytes */
      int pos;

      memset(utf_buffer, 0, sizeof utf_buffer);

      if (u < 0x80) {
          utf_buffer[0] = u;
          return utf_buffer;
      }

      if (u < 0x800)
          extra = 1;
      else if (u < 0x10000)
          extra = 2;
      else if (u < 0x200000)
          extra = 3;
      else if (u < 0x4000000)
          extra = 4;
      else
          extra = 5;

      /* The lead byte carries the top bits; each following byte carries the
         next 6 bits, most significant group first. */
      utf_buffer[0] = lead_bits[extra - 1] | ((u >> (6 * extra)) & lead_mask[extra - 1]);
      for (pos = 1; pos <= extra; pos++)
          utf_buffer[pos] = 0x80 | ((u >> (6 * (extra - pos))) & 0x3f);

      return utf_buffer;
  }
  153 
  154 void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
  155 {
  156     unsigned char *p = encode_utf_8(u);
  157     while (p[1]) {
  158         if (ct[*p].t) ct = ct[*p].u.tbl;
  159         else {
  160             struct conv_table *nct;
  161             if (ct[*p].u.str != no_str) {
  162                 internal("bad utf encoding #1");
  163                 return;
  164             }
  165             nct = mem_alloc(sizeof(struct conv_table) * 256);
  166             memset(nct, 0, sizeof(struct conv_table) * 256);
  167             new_translation_table(nct);
  168             ct[*p].t = 1;
  169             ct[*p].u.tbl = nct;
  170             ct = nct;
  171         }
  172         p++;
  173     }
  174     if (ct[*p].t) {
  175         internal("bad utf encoding #2");
  176         return;
  177     }
  178     if (ct[*p].u.str == no_str) ct[*p].u.str = str;
  179 }
  180 
  181 struct conv_table utf_table[256];
  182 int utf_table_init = 1;
  183 
  184 void free_utf_table()
  185 {
  186     int i;
  187     for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
  188 }
  189 
  190 struct conv_table *get_translation_table_to_utf_8(int from)
  191 {
  192     int i;
  193     static int lfr = -1;
  194     if (from == -1) return NULL;
  195     if (from == lfr) return utf_table;
  196     if (utf_table_init) memset(utf_table, 0, sizeof(struct conv_table) * 256), utf_table_init = 0;
  197     else free_utf_table();
  198     for (i = 0; i < 128; i++) utf_table[i].u.str = strings[i];
  199     if (codepages[from].table == table_utf_8) {
  200         for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
  201         return utf_table;
  202     }
  203     for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
  204     for (i = 0; codepages[from].table[i].c; i++) {
  205         int u = codepages[from].table[i].u;
  206         if (!utf_table[codepages[from].table[i].c].u.str)
  207             utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
  208     }
  209     for (i = 128; i < 256; i++)
  210         if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
  211     return utf_table;
  212 }
  213 
  214 struct conv_table table[256];
  215 static int first = 1;
  216 
  217 void free_conv_table()
  218 {
  219     if (!utf_table_init) free_utf_table();
  220     if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
  221     new_translation_table(table);
  222     if (no_str) mem_free(no_str), no_str = NULL;
  223 }
  224 
  225 struct conv_table *get_translation_table(int from, int to)
  226 {
  227     int i;
  228     static int lfr = -1;
  229     static int lto = -1;
  230     if (!no_str) no_str = stracpy("*");
  231     if (first) memset(table, 0, sizeof(struct conv_table) * 256), first = 0;
  232     if (/*from == to ||*/ from == -1 || to == -1) return NULL;
  233     if (codepages[to].table == table_utf_8) return get_translation_table_to_utf_8(from);
  234     if (from == lfr && to == lto) return table;
  235     lfr = from; lto = to;
  236     new_translation_table(table);
  237     if (codepages[from].table == table_utf_8) {
  238         int j;
  239         for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, codepages[to].table[j].u == 0xa0 ? "\001" : codepages[to].table[j].u == 0xad ? "" : strings[codepages[to].table[j].c]);
  240         for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, unicode_7b[i].s);
  241     } else for (i = 128; i < 256; i++) {
  242         int j;
  243         char *u;
  244         for (j = 0; codepages[from].table[j].c; j++) {
  245             if (codepages[from].table[j].c == i) goto f;
  246         }
  247         continue;
  248         f:
  249         u = u2cp(codepages[from].table[j].u, to, 1);
  250         if (u) table[i].u.str = u;
  251     }
  252     return table;
  253 }
  254 
  /*
   * Compare the NUL-terminated string `s1` with the first `l2` bytes of `s2`
   * (`s2` need not be terminated).
   *
   * Returns 0 when `s1` is exactly those `l2` bytes, 1 when `s1` sorts after
   * them (including when it is longer), and -1 when it sorts before them
   * (including when it is shorter).
   */
  static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
  {
      int k;

      for (k = 0; k != l2; k++) {
          if (s1[k] > s2[k])
              return 1;
          if (s1[k] == 0 || s1[k] < s2[k])
              return -1;
      }
      /* All l2 bytes matched; s1 is greater only if it continues. */
      return s1[k] != 0;
  }
  264 
  /*
   * Parse the numeric part of a character reference: `l` bytes at `st`,
   * either decimal digits or an 'x'/'X' followed by hexadecimal digits.
   *
   * Returns the parsed value, or -1 if there are no digits, a byte is not a
   * valid digit, or the value reaches 0x10000.
   */
  int get_entity_number(unsigned char *st, int l)
  {
      int value = 0;
      int base = 10;

      if (upcase(st[0]) == 'X') {
          st++, l--;
          base = 16;
      }
      if (!l)
          return -1;

      do {
          unsigned char raw = *(st++);
          char c = base == 16 ? upcase(raw) : raw;
          int digit;

          if (c >= '0' && c <= '9')
              digit = c - '0';
          else if (base == 16 && c >= 'A' && c <= 'F')
              digit = c - 'A' + 10;
          else
              return -1;

          value = value * base + digit;
          /* Checked after every digit, so the accumulator stays small. */
          if (value >= 0x10000)
              return -1;
      } while (--l);

      return value;
  }
  289 
  290 unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
  291 {
  292     int n;
  293     if (l <= 0) return NULL;
  294     if (st[0] == '#') {
  295         if (l == 1) return NULL;
  296         if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
  297         if (n < 32 && get_attr_val_nl != 2) n = 32;
  298     } else {
  299         int s = 0, e = N_ENTITIES - 1;
  300         while (s <= e) {
  301             int c;
  302             int m = (s + e) / 2;
  303             c = xxstrcmp(entities[m].s, st, l);
  304             if (!c) {
  305                 n = entities[m].c;
  306                 goto f;
  307             }
  308             if (c > 0) e = m - 1;
  309             else s = m + 1;
  310         }
  311         return NULL;
  312         f:;
  313     }
  314 
  315     return u2cp(n, encoding, 1);
  316 }
  317 
  318 unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l)
  319 {
  320     unsigned char *buffer;
  321     int bp = 0;
  322     int pp = 0;
  323     if (!ct) {
  324         int i;
  325         for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
  326         return memacpy(c, l);
  327         xx:;
  328     }
  329     buffer = mem_alloc(ALLOC_GR);
  330     while (pp < l) {
  331         unsigned char *e;
  332         if (c[pp] < 128 && c[pp] != '&') {
  333             put_c:
  334             buffer[bp++] = c[pp++];
  335             if (!(bp & (ALLOC_GR - 1))) {
  336                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  337                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  338             }
  339             continue;
  340         }
  341         if (c[pp] != '&') {
  342             struct conv_table *t;
  343             int i;
  344             if (!ct) goto put_c;
  345             t = ct;
  346             i = pp;
  347             decode:
  348             if (!t[c[i]].t) {
  349                 e = t[c[i]].u.str;
  350             } else {
  351                 t = t[c[i++]].u.tbl;
  352                 if (i >= l) goto put_c;
  353                 goto decode;
  354             }
  355             pp = i + 1;
  356         } else {
  357             int i = pp + 1;
  358             if (d_opt->plain) goto put_c;
  359             while (i < l && c[i] != ';' && c[i] != '&' && c[i] > ' ') i++;
  360             if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, d_opt->cp))) goto put_c;
  361             pp = i + (i < l && c[i] == ';');
  362         }
  363         if (!e[0]) continue;
  364         if (!e[1]) {
  365             buffer[bp++] = e[0];
  366             if (!(bp & (ALLOC_GR - 1))) {
  367                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  368                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  369             }
  370             continue;
  371         }
  372         while (*e) {
  373             buffer[bp++] = *(e++);
  374             if (!(bp & (ALLOC_GR - 1))) {
  375                 if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
  376                 buffer = mem_realloc(buffer, bp + ALLOC_GR);
  377             }
  378         }
  379     }
  380     buffer[bp] = 0;
  381     return buffer;
  382 }
  383 
  384 int get_cp_index(unsigned char *n)
  385 {
  386     int i, a, p, q;
  387     int ii = -1, ll = 0;
  388     for (i = 0; codepages[i].name; i++) {
  389         for (a = 0; codepages[i].aliases[a]; a++) {
  390             for (p = 0; n[p]; p++) {
  391                 if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
  392                     for (q = 1; codepages[i].aliases[a][q]; q++) {
  393                         if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
  394                     }
  395                     if (strlen(codepages[i].aliases[a]) > (size_t)ll) {
  396                         ll = strlen(codepages[i].aliases[a]);
  397                         ii = i;
  398                     }
  399                 }
  400                 fail:;
  401             }
  402         }
  403     }
  404     return ii;
  405 }
  406 
  407 unsigned char *get_cp_name(int index)
  408 {
  409     if (index < 0) return "none";
  410     return codepages[index].name;
  411 }
  412 
  413 unsigned char *get_cp_mime_name(int index)
  414 {
  415     if (index < 0) return "none";
  416     if (!codepages[index].aliases) return NULL;
  417     return codepages[index].aliases[0];
  418 }
  419 
  420 int is_cp_special(int index)
  421 {
  422     return codepages[index].table == table_utf_8;
  423 }
  424 
  425 unsigned char charset_upcase(unsigned char ch, int cp)
  426 {
  427     int u, res;
  428     unsigned char *str;
  429     if (ch < 0x80) return upcase(ch);
  430     u = cp2u(ch, cp);
  431     BIN_SEARCH(unicode_upcase, lo, sizeof(unicode_upcase) / sizeof(*unicode_upcase), u, res);
  432     if (res == -1) return ch;
  433     str = u2cp(unicode_upcase[res].up, cp, 0);
  434     if (!str || !str[0] || str[1]) return ch;
  435     return str[0];
  436 }
  437 
  /*
   * Upper-case, in place, the NUL-terminated string that `*chp` points to,
   * treating every byte as a character of codepage `cp`.  The pointer in
   * `*chp` itself is not modified.
   */
  void charset_upcase_string(unsigned char **chp, int cp)
  {
      unsigned char *cur;

      for (cur = *chp; *cur; cur++)
          *cur = charset_upcase(*cur, cp);
  }
  444