"Fossies" - the Fresh Open Source Software Archive

Member "xterm-379/charclass.c" (4 Jan 2023, 13632 Bytes) of package /linux/misc/xterm-379.tgz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charclass.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 377_vs_379.

    1 /* $XTermId: charclass.c,v 1.46 2023/01/04 09:26:46 tom Exp $ */
    2 
    3 /*
    4  * Copyright 2002-2022,2023 by Thomas E. Dickey
    5  *
    6  *                         All Rights Reserved
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining a
    9  * copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be included
   17  * in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * Except as contained in this notice, the name(s) of the above copyright
   28  * holders shall not be used in advertising or otherwise to promote the
   29  * sale, use or other dealings in this Software without prior written
   30  * authorization.
   31  *
   32  *----------------------------------------------------------------------------
   33  * Compact and efficient reimplementation of the
   34  * xterm character class mechanism for large character sets
   35  *
   36  * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
   37  *
   38  * xterm allows users to select entire words with a double-click on the left
   39  * mouse button.  Opinions might differ on what type of characters are part of
   40  * separate words, therefore xterm allows users to configure a class code for
   41  * each 8-bit character.  Words are maximum length sequences of neighboring
   42  * characters with identical class code.  Naively extending this mechanism to
   43  * Unicode would require a class code table with at least 2^16 entries
   44  * (128 kB).
   45  *
   46  * Instead, we transform the character class table into a list of intervals,
   47  * that will be accessed via a linear search.  Changes made to the table by the
   48  * user will be appended.  A special class code IDENT (default) marks
   49  * characters that use their own code number as the class code.
   50  *
   51  * We could alternatively use a sorted table of non-overlapping intervals that
   52  * can be accessed via binary search, but merging in new intervals is
   53  * significantly more hassle and not worth the effort here.
   54  */
   55 
   56 #include <xterm.h>
   57 #include <charclass.h>
   58 
   59 #if OPT_WIDE_CHARS
   60 
   61 #ifdef TEST_DRIVER
   62 
   63 #include <ctype.h>
   64 #include <wchar.h>
   65 #include <wctype.h>
   66 
   67 #if OPT_TRACE
   68 #define Trace if (opt_v) printf
   69 #endif
   70 
   71 #undef OPT_REPORT_CCLASS
   72 #define OPT_REPORT_CCLASS 1
   73 #endif /* TEST_DRIVER */
   74 
   75 static struct classentry {
   76     int cclass;
   77     int first;
   78     int last;
   79 } *classtab;
   80 
   81 #ifdef TEST_DRIVER
   82 static int opt_all;
   83 static int opt_check;
   84 static int opt_quiet;
   85 static int opt_v;
   86 #endif
   87 
   88 void
   89 init_classtab(void)
   90 {
   91     const int size = 50;
   92 
   93     TRACE(("init_classtab " TRACE_L "\n"));
   94 
   95     classtab = TypeMallocN(struct classentry, (unsigned) size);
   96     if (!classtab)
   97     abort();
   98     classtab[0].cclass = size;
   99     classtab[0].first = 1;
  100     classtab[0].last = 0;
  101 
  102     /* old xterm default classes */
  103     SetCharacterClassRange(0, 0, BLANK);
  104     SetCharacterClassRange(1, 31, CNTRL);
  105     SetCharacterClassRange('\t', '\t', BLANK);
  106     SetCharacterClassRange('0', '9', ALNUM);
  107     SetCharacterClassRange('A', 'Z', ALNUM);
  108     SetCharacterClassRange('_', '_', ALNUM);
  109     SetCharacterClassRange('a', 'z', ALNUM);
  110     SetCharacterClassRange(127, 159, CNTRL);
  111     SetCharacterClassRange(160, 191, IDENT);
  112     SetCharacterClassRange(192, 255, ALNUM);
  113     SetCharacterClassRange(215, 215, IDENT);
  114     SetCharacterClassRange(247, 247, IDENT);
  115 
  116     /* added Unicode classes */
  117     SetCharacterClassRange(0x0100, 0xffdf, ALNUM);  /* mostly characters */
  118     SetCharacterClassRange(0x037e, 0x037e, IDENT);  /* Greek question mark */
  119     SetCharacterClassRange(0x0387, 0x0387, IDENT);  /* Greek ano teleia */
  120     SetCharacterClassRange(0x055a, 0x055f, IDENT);  /* Armenian punctuation */
  121     SetCharacterClassRange(0x0589, 0x0589, IDENT);  /* Armenian full stop */
  122     SetCharacterClassRange(0x0700, 0x070d, IDENT);  /* Syriac punctuation */
  123     SetCharacterClassRange(0x104a, 0x104f, IDENT);  /* Myanmar punctuation */
  124     SetCharacterClassRange(0x10fb, 0x10fb, IDENT);  /* Georgian punctuation */
  125     SetCharacterClassRange(0x1361, 0x1368, IDENT);  /* Ethiopic punctuation */
  126     SetCharacterClassRange(0x166d, 0x166e, IDENT);  /* Canadian Syl. punctuation */
  127     SetCharacterClassRange(0x17d4, 0x17dc, IDENT);  /* Khmer punctuation */
  128     SetCharacterClassRange(0x1800, 0x180a, IDENT);  /* Mongolian punctuation */
  129     SetCharacterClassRange(0x2000, 0x200a, BLANK);  /* spaces */
  130     SetCharacterClassRange(0x200b, 0x200f, CNTRL);  /* formatting */
  131     SetCharacterClassRange(0x2010, 0x27ff, IDENT);  /* punctuation and symbols */
  132     SetCharacterClassRange(0x202a, 0x202e, CNTRL);  /* formatting */
  133     SetCharacterClassRange(0x2060, 0x206f, CNTRL);  /* formatting */
  134     SetCharacterClassRange(0x2070, 0x207f, U_SUP);  /* superscript */
  135     SetCharacterClassRange(0x2080, 0x208f, U_SUB);  /* subscript */
  136     SetCharacterClassRange(0x3000, 0x3000, BLANK);  /* ideographic space */
  137     SetCharacterClassRange(0x3001, 0x3020, IDENT);  /* ideographic punctuation */
  138     SetCharacterClassRange(0x3040, 0x309f, U_HIR);  /* Hiragana */
  139     SetCharacterClassRange(0x30a0, 0x30ff, U_KAT);  /* Katakana */
  140     SetCharacterClassRange(0x3300, 0x9fff, U_CJK);  /* CJK Ideographs */
  141     SetCharacterClassRange(0xac00, 0xd7a3, U_HAN);  /* Hangul Syllables */
  142     SetCharacterClassRange(0xf900, 0xfaff, U_CJK);  /* CJK Ideographs */
  143     SetCharacterClassRange(0xfe30, 0xfe6b, IDENT);  /* punctuation forms */
  144     SetCharacterClassRange(0xfeff, 0xfeff, CNTRL);  /* formatting */
  145     SetCharacterClassRange(0xff00, 0xff0f, IDENT);  /* half/fullwidth ASCII */
  146     SetCharacterClassRange(0xff1a, 0xff20, IDENT);  /* half/fullwidth ASCII */
  147     SetCharacterClassRange(0xff3b, 0xff40, IDENT);  /* half/fullwidth ASCII */
  148     SetCharacterClassRange(0xff5b, 0xff64, IDENT);  /* half/fullwidth ASCII */
  149     SetCharacterClassRange(0xfff9, 0xfffb, CNTRL);  /* formatting */
  150 
  151     TRACE((TRACE_R " init_classtab\n"));
  152     return;
  153 }
  154 
  155 int
  156 CharacterClass(int c)
  157 {
  158     int i, cclass = IDENT;
  159 
  160     for (i = classtab[0].first; i <= classtab[0].last; i++)
  161     if (classtab[i].first <= c && classtab[i].last >= c)
  162         cclass = classtab[i].cclass;
  163 
  164     if (cclass < 0)
  165     cclass = c;
  166 
  167     return cclass;
  168 }
  169 
  170 #if OPT_REPORT_CCLASS
  171 #define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
  172 static const char *
  173 class_name(Classes code)
  174 {
  175     static char buffer[80];
  176     const char *result = "?";
  177     switch (code) {
  178     case ALNUM:
  179     result = "ALNUM";
  180     break;
  181     case BLANK:
  182     result = "BLANK";
  183     break;
  184     case CNTRL:
  185     result = "CNTRL";
  186     break;
  187     case OTHER:
  188     result = "OTHER";
  189     break;
  190     case IDENT:
  191     result = "IDENT";
  192     break;
  193     case U_SUP:
  194     result = "superscript";
  195     break;
  196     case U_SUB:
  197     result = "subscript";
  198     break;
  199     case U_CJK:
  200     result = "CJK Ideographs";
  201     break;
  202     case U_HIR:
  203     result = "Hiragana";
  204     break;
  205     case U_KAT:
  206     result = "Katakana";
  207     break;
  208     case U_HAN:
  209     result = "Hangul Syllables";
  210     break;
  211     default:
  212     sprintf(buffer, charFormat(code), code);
  213     result = buffer;
  214     break;
  215     }
  216     return result;
  217 }
  218 
  219 /*
  220  * Special convention for classtab[0]:
  221  * - classtab[0].cclass is the allocated number of entries in classtab
  222  * - classtab[0].first = 1 (first used entry in classtab)
  223  * - classtab[0].last is the last used entry in classtab
  224  */
  225 
  226 int
  227 SetCharacterClassRange(int low, int high, int value)
  228 {
  229     TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n",
  230        low, high, class_name(value)));
  231 
  232     if (high < low)
  233     return -1;      /* nothing to do */
  234 
  235     /* make sure we have at least one free entry left at table end */
  236     if (classtab[0].last > classtab[0].cclass - 2) {
  237     classtab[0].cclass += 5 + classtab[0].cclass / 4;
  238     classtab = TypeRealloc(struct classentry,
  239                      (unsigned) classtab[0].cclass, classtab);
  240     if (!classtab)
  241         abort();
  242     }
  243 
  244     /* simply append new interval to end of interval array */
  245     classtab[0].last++;
  246     classtab[classtab[0].last].first = low;
  247     classtab[classtab[0].last].last = high;
  248     classtab[classtab[0].last].cclass = value;
  249 
  250     return 0;
  251 }
  252 
  253 void
  254 report_wide_char_class(void)
  255 {
  256     static const Classes known_classes[] =
  257     {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
  258     int i;
  259 
  260     printf("\n");
  261     printf("Unicode charClass data uses the last match\n");
  262     printf("from these overlapping intervals of character codes:\n");
  263     for (i = classtab[0].first; i <= classtab[0].last; i++) {
  264     printf("\tU+%04X .. U+%04X %s\n",
  265            classtab[i].first,
  266            classtab[i].last,
  267            class_name((Classes) classtab[i].cclass));
  268     }
  269     printf("\n");
  270     printf("These class-names are used internally (the first character code in a class):\n");
  271     for (i = 0; i < (int) XtNumber(known_classes); ++i) {
  272     printf("\t");
  273     printf(charFormat(known_classes[i]), known_classes[i]);
  274     printf(" = %s\n", class_name(known_classes[i]));
  275     }
  276 }
  277 #endif /* OPT_REPORT_CCLASS */
  278 
  279 #ifdef NO_LEAKS
  280 void
  281 noleaks_CharacterClass(void)
  282 {
  283     FreeAndNull(classtab);
  284 }
  285 #endif
  286 #endif /* OPT_WIDE_CHARS */
  287 
  288 #ifdef TEST_DRIVER
  289 #if OPT_WIDE_CHARS
  290 static void
  291 usage(void)
  292 {
  293     static const char *msg[] =
  294     {
  295     "Usage: test_charclass [options] [c1[-c1b] [c2-[c2b] [...]]]",
  296     "",
  297     "Options:",
  298     " -a  show all data",
  299     " -s  show only summary",
  300     " -v  verbose"
  301     };
  302     size_t n;
  303     for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
  304     fprintf(stderr, "%s\n", msg[n]);
  305     }
  306     exit(EXIT_FAILURE);
  307 }
  308 
  309 static int
  310 expected_class(int wch)
  311 {
  312     int result = wch;
  313     wint_t ch = (wint_t) wch;
  314     if (ch == '\0' || ch == '\t') {
  315     result = BLANK;
  316     } else if (iswcntrl(ch)) {
  317     result = CNTRL;
  318     } else if (iswspace(ch)) {
  319     result = BLANK;
  320     } else if (ch < 127) {
  321     if (isalnum(ch) || ch == '_') {
  322         result = ALNUM;
  323     }
  324     } else if (ch == 170 || ch == 181 || ch == 186) {
  325     ;
  326     } else if (iswalnum(ch)) {
  327     result = ALNUM;
  328     }
  329     return result;
  330 }
  331 
  332 static int
  333 show_cclass_range(int lo, int hi)
  334 {
  335     int cclass = CharacterClass(lo);
  336     int ident = (cclass == lo);
  337     int more = 0;
  338     if (ident) {
  339     int ch;
  340     for (ch = lo + 1; ch <= hi; ch++) {
  341         if (CharacterClass(ch) != ch) {
  342         ident = 0;
  343         break;
  344         }
  345     }
  346     if (ident && (hi < 255)) {
  347         ch = hi + 1;
  348         if (CharacterClass(ch) == ch) {
  349         if (ch >= 255 || CharacterClass(ch + 1) != ch) {
  350             more = 1;
  351         }
  352         }
  353     }
  354     }
  355     if (!more) {
  356     if (lo == hi) {
  357         printf("\t%d", lo);
  358     } else {
  359         printf("\t%d-%d", lo, hi);
  360     }
  361     if (!ident)
  362         printf(":%d", cclass);
  363     if (hi < 255)
  364         printf(", \\");
  365     printf("\n");
  366     }
  367     return !more;
  368 }
  369 
  370 static void
  371 report_resource(int first, int last)
  372 {
  373     int class_p;
  374     int ch;
  375     int dh;
  376 
  377     class_p = CharacterClass(dh = first);
  378     for (ch = first; ch < last; ++ch) {
  379     int class_c = CharacterClass(ch);
  380     if (class_c != class_p) {
  381         if (show_cclass_range(dh, ch - 1)) {
  382         dh = ch;
  383         class_p = class_c;
  384         }
  385     }
  386     }
  387     if (dh < last - 1) {
  388     show_cclass_range(dh, last - 1);
  389     }
  390 }
  391 
  392 static int
  393 decode_one(const char *source, char **target)
  394 {
  395     int result = -1;
  396     long check;
  397     int radix = 0;
  398     if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
  399     source += 2;
  400     radix = 16;
  401     }
  402     check = strtol(source, target, radix);
  403     if (*target != NULL && *target != source)
  404     result = (int) check;
  405     return result;
  406 }
  407 
  408 static int
  409 decode_range(const char *source, int *lo, int *hi)
  410 {
  411     int result = 0;
  412     char *after1;
  413     char *after2;
  414     if ((*lo = decode_one(source, &after1)) >= 0) {
  415     after1 += strspn(after1, ":-.\t ");
  416     if ((*hi = decode_one(after1, &after2)) < 0) {
  417         *hi = *lo;
  418     }
  419     result = 1;
  420     }
  421     return result;
  422 }
  423 
  424 static void
  425 do_range(const char *source)
  426 {
  427     int lo, hi;
  428     if (decode_range(source, &lo, &hi)) {
  429     if (opt_all) {
  430         while (lo <= hi) {
  431         int other_rc = CharacterClass(lo);
  432         if (!opt_quiet)
  433             printf("U+%04X\t%s\n", lo, class_name(other_rc));
  434         ++lo;
  435         }
  436     } else if (opt_check) {
  437         while (lo <= hi) {
  438         int expect = expected_class(lo);
  439         int actual = CharacterClass(lo);
  440         if (actual != expect)
  441             printf("U+%04X\t%s ->%s\n", lo,
  442                class_name(expect),
  443                class_name(actual));
  444         ++lo;
  445         }
  446     } else {
  447         printf("\"charClass\" resource for [%d..%d]:\n", lo, hi);
  448         report_resource(lo, hi + 1);
  449     }
  450     }
  451 }
  452 #endif /* OPT_WIDE_CHARS */
  453 
  454 /*
  455  * TODO: add option to show do_range in hex
  456  */
  457 int
  458 main(int argc, char **argv ENVP_ARG)
  459 {
  460 #if OPT_WIDE_CHARS
  461     int ch;
  462 #endif
  463 
  464     (void) argc;
  465     (void) argv;
  466 
  467 #if OPT_WIDE_CHARS
  468     setlocale(LC_ALL, "");
  469     while ((ch = getopt(argc, argv, "acsv")) != -1) {
  470     switch (ch) {
  471     case 'a':
  472         opt_all = 1;
  473         break;
  474     case 'c':
  475         opt_check = 1;
  476         break;
  477     case 's':
  478         opt_quiet = 1;
  479         break;
  480     case 'v':
  481         opt_v = 1;
  482         break;
  483     default:
  484         usage();
  485     }
  486     }
  487     init_classtab();
  488 
  489     if (optind >= argc) {
  490     do_range("0-255");
  491     } else {
  492     while (optind < argc) {
  493         do_range(argv[optind++]);
  494     }
  495     }
  496     report_wide_char_class();
  497 #else
  498     printf("wide-character support is not configured\n");
  499 #endif /* OPT_WIDE_CHARS */
  500     return 0;
  501 }
  502 #endif /* TEST_DRIVER */