"Fossies" - the Fresh Open Source Software Archive 
Member "xterm-379/charclass.c" (4 Jan 2023, 13632 Bytes) of package /linux/misc/xterm-379.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "charclass.c" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
377_vs_379.
1 /* $XTermId: charclass.c,v 1.46 2023/01/04 09:26:46 tom Exp $ */
2
3 /*
4 * Copyright 2002-2022,2023 by Thomas E. Dickey
5 *
6 * All Rights Reserved
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Except as contained in this notice, the name(s) of the above copyright
28 * holders shall not be used in advertising or otherwise to promote the
29 * sale, use or other dealings in this Software without prior written
30 * authorization.
31 *
32 *----------------------------------------------------------------------------
33 * Compact and efficient reimplementation of the
34 * xterm character class mechanism for large character sets
35 *
36 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
37 *
38 * xterm allows users to select entire words with a double-click on the left
39 * mouse button. Opinions might differ on what type of characters are part of
40 * separate words, therefore xterm allows users to configure a class code for
41 * each 8-bit character. Words are maximum length sequences of neighboring
42 * characters with identical class code. Extending this mechanism to Unicode
43 * naively would create an at least 2^16 entries (128 kB) long class code
44 * table.
45 *
46 * Instead, we transform the character class table into a list of intervals,
47 * that will be accessed via a linear search. Changes made to the table by the
48 * user will be appended. A special class code IDENT (default) marks
49 * characters who have their code number as the class code.
50 *
51 * We could alternatively use a sorted table of non-overlapping intervals that
52 * can be accessed via binary search, but merging in new intervals is
53 * significantly more hassle and not worth the effort here.
54 */
55
56 #include <xterm.h>
57 #include <charclass.h>
58
59 #if OPT_WIDE_CHARS
60
61 #ifdef TEST_DRIVER
62
63 #include <ctype.h>
64 #include <wchar.h>
65 #include <wctype.h>
66
67 #if OPT_TRACE
68 #define Trace if (opt_v) printf
69 #endif
70
71 #undef OPT_REPORT_CCLASS
72 #define OPT_REPORT_CCLASS 1
73 #endif /* TEST_DRIVER */
74
/*
 * One interval of character codes together with the class assigned to it.
 * classtab points to a heap-allocated array of these; entry 0 is not an
 * interval but bookkeeping for the array itself.
 */
static struct classentry {
    int cclass;			/* class code of the interval; in entry 0: number of allocated entries */
    int first;			/* lowest character code covered; in entry 0: index of the first used entry (1) */
    int last;			/* highest character code covered; in entry 0: index of the last used entry */
} *classtab;
80
81 #ifdef TEST_DRIVER
/* Command-line switches of the test driver, set by main() from getopt(). */
static int opt_all;		/* -a: do_range prints the class of every character in a range */
static int opt_check;		/* -c: do_range prints characters whose class differs from expected_class() */
static int opt_quiet;		/* -s: suppresses the per-character lines of the -a mode */
static int opt_v;		/* -v: gates the Trace macro when OPT_TRACE is enabled */
86 #endif
87
88 void
89 init_classtab(void)
90 {
91 const int size = 50;
92
93 TRACE(("init_classtab " TRACE_L "\n"));
94
95 classtab = TypeMallocN(struct classentry, (unsigned) size);
96 if (!classtab)
97 abort();
98 classtab[0].cclass = size;
99 classtab[0].first = 1;
100 classtab[0].last = 0;
101
102 /* old xterm default classes */
103 SetCharacterClassRange(0, 0, BLANK);
104 SetCharacterClassRange(1, 31, CNTRL);
105 SetCharacterClassRange('\t', '\t', BLANK);
106 SetCharacterClassRange('0', '9', ALNUM);
107 SetCharacterClassRange('A', 'Z', ALNUM);
108 SetCharacterClassRange('_', '_', ALNUM);
109 SetCharacterClassRange('a', 'z', ALNUM);
110 SetCharacterClassRange(127, 159, CNTRL);
111 SetCharacterClassRange(160, 191, IDENT);
112 SetCharacterClassRange(192, 255, ALNUM);
113 SetCharacterClassRange(215, 215, IDENT);
114 SetCharacterClassRange(247, 247, IDENT);
115
116 /* added Unicode classes */
117 SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */
118 SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */
119 SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */
120 SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */
121 SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */
122 SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */
123 SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */
124 SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */
125 SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */
126 SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */
127 SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */
128 SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */
129 SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */
130 SetCharacterClassRange(0x200b, 0x200f, CNTRL); /* formatting */
131 SetCharacterClassRange(0x2010, 0x27ff, IDENT); /* punctuation and symbols */
132 SetCharacterClassRange(0x202a, 0x202e, CNTRL); /* formatting */
133 SetCharacterClassRange(0x2060, 0x206f, CNTRL); /* formatting */
134 SetCharacterClassRange(0x2070, 0x207f, U_SUP); /* superscript */
135 SetCharacterClassRange(0x2080, 0x208f, U_SUB); /* subscript */
136 SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */
137 SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */
138 SetCharacterClassRange(0x3040, 0x309f, U_HIR); /* Hiragana */
139 SetCharacterClassRange(0x30a0, 0x30ff, U_KAT); /* Katakana */
140 SetCharacterClassRange(0x3300, 0x9fff, U_CJK); /* CJK Ideographs */
141 SetCharacterClassRange(0xac00, 0xd7a3, U_HAN); /* Hangul Syllables */
142 SetCharacterClassRange(0xf900, 0xfaff, U_CJK); /* CJK Ideographs */
143 SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */
144 SetCharacterClassRange(0xfeff, 0xfeff, CNTRL); /* formatting */
145 SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */
146 SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */
147 SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */
148 SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */
149 SetCharacterClassRange(0xfff9, 0xfffb, CNTRL); /* formatting */
150
151 TRACE((TRACE_R " init_classtab\n"));
152 return;
153 }
154
155 int
156 CharacterClass(int c)
157 {
158 int i, cclass = IDENT;
159
160 for (i = classtab[0].first; i <= classtab[0].last; i++)
161 if (classtab[i].first <= c && classtab[i].last >= c)
162 cclass = classtab[i].cclass;
163
164 if (cclass < 0)
165 cclass = c;
166
167 return cclass;
168 }
169
170 #if OPT_REPORT_CCLASS
171 #define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
172 static const char *
173 class_name(Classes code)
174 {
175 static char buffer[80];
176 const char *result = "?";
177 switch (code) {
178 case ALNUM:
179 result = "ALNUM";
180 break;
181 case BLANK:
182 result = "BLANK";
183 break;
184 case CNTRL:
185 result = "CNTRL";
186 break;
187 case OTHER:
188 result = "OTHER";
189 break;
190 case IDENT:
191 result = "IDENT";
192 break;
193 case U_SUP:
194 result = "superscript";
195 break;
196 case U_SUB:
197 result = "subscript";
198 break;
199 case U_CJK:
200 result = "CJK Ideographs";
201 break;
202 case U_HIR:
203 result = "Hiragana";
204 break;
205 case U_KAT:
206 result = "Katakana";
207 break;
208 case U_HAN:
209 result = "Hangul Syllables";
210 break;
211 default:
212 sprintf(buffer, charFormat(code), code);
213 result = buffer;
214 break;
215 }
216 return result;
217 }
218
219 /*
220 * Special convention for classtab[0]:
221 * - classtab[0].cclass is the allocated number of entries in classtab
222 * - classtab[0].first = 1 (first used entry in classtab)
223 * - classtab[0].last is the last used entry in classtab
224 */
225
226 int
227 SetCharacterClassRange(int low, int high, int value)
228 {
229 TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n",
230 low, high, class_name(value)));
231
232 if (high < low)
233 return -1; /* nothing to do */
234
235 /* make sure we have at least one free entry left at table end */
236 if (classtab[0].last > classtab[0].cclass - 2) {
237 classtab[0].cclass += 5 + classtab[0].cclass / 4;
238 classtab = TypeRealloc(struct classentry,
239 (unsigned) classtab[0].cclass, classtab);
240 if (!classtab)
241 abort();
242 }
243
244 /* simply append new interval to end of interval array */
245 classtab[0].last++;
246 classtab[classtab[0].last].first = low;
247 classtab[classtab[0].last].last = high;
248 classtab[classtab[0].last].cclass = value;
249
250 return 0;
251 }
252
253 void
254 report_wide_char_class(void)
255 {
256 static const Classes known_classes[] =
257 {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
258 int i;
259
260 printf("\n");
261 printf("Unicode charClass data uses the last match\n");
262 printf("from these overlapping intervals of character codes:\n");
263 for (i = classtab[0].first; i <= classtab[0].last; i++) {
264 printf("\tU+%04X .. U+%04X %s\n",
265 classtab[i].first,
266 classtab[i].last,
267 class_name((Classes) classtab[i].cclass));
268 }
269 printf("\n");
270 printf("These class-names are used internally (the first character code in a class):\n");
271 for (i = 0; i < (int) XtNumber(known_classes); ++i) {
272 printf("\t");
273 printf(charFormat(known_classes[i]), known_classes[i]);
274 printf(" = %s\n", class_name(known_classes[i]));
275 }
276 }
277 #endif /* OPT_REPORT_CCLASS */
278
279 #ifdef NO_LEAKS
/*
 * Leak-checking hook (compiled only with NO_LEAKS): hands the interval table
 * to FreeAndNull().
 * NOTE(review): FreeAndNull is defined elsewhere; presumably it frees the
 * table and resets classtab to NULL -- confirm against its definition.
 */
void
noleaks_CharacterClass(void)
{
    FreeAndNull(classtab);
}
285 #endif
286 #endif /* OPT_WIDE_CHARS */
287
288 #ifdef TEST_DRIVER
289 #if OPT_WIDE_CHARS
/*
 * Print the test driver's usage message to standard error and exit with
 * EXIT_FAILURE.  Does not return.
 *
 * The option list mirrors the getopt() string "acsv" used by main(); the
 * -c option was previously accepted but undocumented, and the second range
 * in the synopsis had its bracket in the wrong place.
 */
static void
usage(void)
{
    static const char *const msg[] =
    {
	"Usage: test_charclass [options] [c1[-c1b] [c2[-c2b] [...]]]",
	"",
	"Options:",
	" -a show all data",
	" -c show only characters whose class differs from the locale's",
	" -s show only summary",
	" -v verbose"
    };
    size_t n;

    for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
	fprintf(stderr, "%s\n", msg[n]);
    }
    exit(EXIT_FAILURE);
}
308
309 static int
310 expected_class(int wch)
311 {
312 int result = wch;
313 wint_t ch = (wint_t) wch;
314 if (ch == '\0' || ch == '\t') {
315 result = BLANK;
316 } else if (iswcntrl(ch)) {
317 result = CNTRL;
318 } else if (iswspace(ch)) {
319 result = BLANK;
320 } else if (ch < 127) {
321 if (isalnum(ch) || ch == '_') {
322 result = ALNUM;
323 }
324 } else if (ch == 170 || ch == 181 || ch == 186) {
325 ;
326 } else if (iswalnum(ch)) {
327 result = ALNUM;
328 }
329 return result;
330 }
331
/*
 * Print one item of a "charClass" resource for the codes lo..hi, unless the
 * item should be held back so that a longer run can be printed later.
 *
 * The range counts as an "identity" run when every code in it is its own
 * class.  Such a run is held back (nothing printed, 0 returned) when hi is
 * below 255 and the following code also is its own class and passes the
 * look-ahead test below.  Otherwise the item is printed as "lo" or "lo-hi",
 * followed by ":class" unless it is an identity run, and by a ", \"
 * continuation when hi is below 255; the function then returns 1.
 */
static int
show_cclass_range(int lo, int hi)
{
    const int cclass = CharacterClass(lo);
    int identity = (cclass == lo);

    if (identity) {
	int probe;

	/* the run is an identity run only if every member maps to itself */
	for (probe = lo + 1; identity && probe <= hi; probe++) {
	    identity = (CharacterClass(probe) == probe);
	}
    }

    if (identity && hi < 255) {
	const int next = hi + 1;

	/*
	 * NOTE(review): the second look-ahead compares the class of next + 1
	 * with next (not with next + 1); kept exactly as found -- confirm
	 * that this is the intended test.
	 */
	if (CharacterClass(next) == next
	    && (next >= 255 || CharacterClass(next + 1) != next)) {
	    return 0;		/* defer: caller keeps extending the run */
	}
    }

    if (lo != hi) {
	printf("\t%d-%d", lo, hi);
    } else {
	printf("\t%d", lo);
    }
    if (!identity)
	printf(":%d", cclass);
    if (hi < 255)
	printf(", \\");
    printf("\n");

    return 1;
}
369
/*
 * Print the class assignments for the codes first..last-1 as items of a
 * "charClass" resource, merging neighboring codes of equal class.
 *
 * dh is the first code not yet printed and class_p the class it started
 * with.  Whenever the class changes, show_cclass_range() is asked to print
 * dh..ch-1; it may decline (returning 0) in order to extend the run, in
 * which case dh and class_p are left alone.
 *
 * After the loop everything from dh up to last-1 is still unprinted, so it
 * is flushed whenever that remainder is non-empty.  The previous test
 * (dh < last - 1) skipped a remainder of exactly one character, silently
 * dropping the final code of ranges such as "65" or "0-247".
 */
static void
report_resource(int first, int last)
{
    int class_p;
    int ch;
    int dh;

    class_p = CharacterClass(dh = first);
    for (ch = first; ch < last; ++ch) {
	int class_c = CharacterClass(ch);
	if (class_c != class_p) {
	    if (show_cclass_range(dh, ch - 1)) {
		dh = ch;
		class_p = class_c;
	    }
	}
    }
    if (dh < last) {
	show_cclass_range(dh, last - 1);
    }
}
391
/*
 * Parse one character code from the start of source.
 *
 * A leading "U+" or "u+" selects hexadecimal; otherwise strtol()'s automatic
 * radix detection applies (decimal, 0x... hexadecimal, 0... octal).
 *
 * source  text to parse
 * target  receives the position just past the parsed number, as set by
 *         strtol()
 *
 * Returns the code, or -1 if no digits were found, if the number is
 * negative, or if it does not fit in an int.  The range test prevents a
 * large value from being silently truncated to some unrelated code.
 */
static int
decode_one(const char *source, char **target)
{
    int result = -1;
    long check;
    int radix = 0;

    if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
	source += 2;
	radix = 16;
    }
    check = strtol(source, target, radix);
    if (*target != NULL && *target != source) {
	/* accept only values which survive the conversion to int unchanged */
	if (check >= 0 && check == (long) (int) check)
	    result = (int) check;
    }
    return result;
}
407
/*
 * Parse a range "lo<sep>hi" or a single code "lo" from source, where <sep>
 * is any run of ':', '-', '.', tab or space characters.
 *
 * On success stores the bounds through lo and hi (hi is set equal to lo when
 * no second number can be decoded) and returns 1.  Returns 0 if the first
 * number cannot be decoded; *lo then holds the negative failure value and
 * *hi is not written.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *rest;
    char *ignored;

    *lo = decode_one(source, &rest);
    if (*lo < 0)
	return 0;

    /* step over the separator between the two numbers */
    rest += strspn(rest, ":-.\t ");

    *hi = decode_one(rest, &ignored);
    if (*hi < 0)
	*hi = *lo;

    return 1;
}
423
424 static void
425 do_range(const char *source)
426 {
427 int lo, hi;
428 if (decode_range(source, &lo, &hi)) {
429 if (opt_all) {
430 while (lo <= hi) {
431 int other_rc = CharacterClass(lo);
432 if (!opt_quiet)
433 printf("U+%04X\t%s\n", lo, class_name(other_rc));
434 ++lo;
435 }
436 } else if (opt_check) {
437 while (lo <= hi) {
438 int expect = expected_class(lo);
439 int actual = CharacterClass(lo);
440 if (actual != expect)
441 printf("U+%04X\t%s ->%s\n", lo,
442 class_name(expect),
443 class_name(actual));
444 ++lo;
445 }
446 } else {
447 printf("\"charClass\" resource for [%d..%d]:\n", lo, hi);
448 report_resource(lo, hi + 1);
449 }
450 }
451 }
452 #endif /* OPT_WIDE_CHARS */
453
454 /*
455 * TODO: add option to show do_range in hex
456 */
457 int
458 main(int argc, char **argv ENVP_ARG)
459 {
460 #if OPT_WIDE_CHARS
461 int ch;
462 #endif
463
464 (void) argc;
465 (void) argv;
466
467 #if OPT_WIDE_CHARS
468 setlocale(LC_ALL, "");
469 while ((ch = getopt(argc, argv, "acsv")) != -1) {
470 switch (ch) {
471 case 'a':
472 opt_all = 1;
473 break;
474 case 'c':
475 opt_check = 1;
476 break;
477 case 's':
478 opt_quiet = 1;
479 break;
480 case 'v':
481 opt_v = 1;
482 break;
483 default:
484 usage();
485 }
486 }
487 init_classtab();
488
489 if (optind >= argc) {
490 do_range("0-255");
491 } else {
492 while (optind < argc) {
493 do_range(argv[optind++]);
494 }
495 }
496 report_wide_char_class();
497 #else
498 printf("wide-character support is not configured\n");
499 #endif /* OPT_WIDE_CHARS */
500 return 0;
501 }
502 #endif /* TEST_DRIVER */