"Fossies" - the Fresh Open Source Software Archive 
Member "tin-2.6.2/src/charset.c" (9 Dec 2022, 12931 Bytes) of package /linux/misc/tin-2.6.2.tar.xz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "charset.c" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
2.6.1_vs_2.6.2.
1 /*
2 * Project : tin - a Usenet reader
3 * Module : charset.c
4 * Author : M. Kuhn, T. Burmester
5 * Created : 1993-12-10
6 * Updated : 2021-02-23
7 * Notes : ISO to ascii charset conversion routines
8 *
9 * Copyright (c) 1993-2023 Markus Kuhn <mgk25@cl.cam.ac.uk>
10 * All rights reserved.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 *
16 * 1. Redistributions of source code must retain the above copyright notice,
17 * this list of conditions and the following disclaimer.
18 *
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * 3. Neither the name of the copyright holder nor the names of its
24 * contributors may be used to endorse or promote products derived from
25 * this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
31 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40
41 #ifndef TIN_H
42 # include "tin.h"
43 #endif /* !TIN_H */
44
45 /*
46 * Table for the iso2asc conversion
47 * iso2asc by (unrza3@cd4680fs.rrze.uni-erlangen.de)
48 * included by (root@aspic.han.de)
49 */
50
/* placeholder used in the tables for characters without a substitute */
#define SUB "?"

/* first character code of the second (8-bit) range of printable chars */
#define ISO_EXTRA 0xa0

/*
 * TABSTOP(x) yields the column reached by a TAB typed at column x, i.e.
 * the next multiple of 8 that is strictly greater than x (columns are
 * counted from 0). Setting the three low bits and adding one is the same
 * as rounding x down to a multiple of 8 and then adding 8.
 */
#define TABSTOP(x) (((x) | 7) + 1)
60
61 static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] =
62 {
63 /* universal table for many languages */
64 {
65 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
66 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
67 "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
68 "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
69 "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
70 "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
71 },
72 /* single-spacing universal table */
73 {
74 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
75 " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
76 "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
77 "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
78 "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
79 "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
80 },
81 /* table for Danish, Dutch, German, Norwegian and Swedish */
82 {
83 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
84 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
85 "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
86 "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
87 "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
88 "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
89 },
90 /* table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant */
91 {
92 " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
93 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
94 "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
95 "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
96 "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
97 "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
98 },
99 /* table with RFC1345 codes in brackets */
100 {
101 "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
102 "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
103 "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
104 "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
105 "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
106 "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
107 "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
108 "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
109 "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
110 "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
111 "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
112 "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
113 },
114 /* table for printers that allow overstriking with backspace */
115 {
116 " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
117 "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
118 " ","+\b_","2","3","'","u","P",".",
119 ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
120 "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
121 "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
122 "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
123 "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
124 "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
125 "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
126 "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
127 "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
128 },
129 /* table for IBM PC character set (code page 437) */
130 {
131 "\377","\255","\233","\234",SUB,"\235","|","\25",
132 "\"","(c)","\246","\256","\252","-","(R)","-",
133 "\370","\361","\375","3","'","\346","\24","\371",
134 ",","1","\247","\257","\254","\253"," 3/4","\250",
135 "A","A","A","A","\216","\217","\222","\200",
136 "E","\220","E","E","I","I","I","I",
137 "D","\245","O","O","O","O","\231","x",
138 "\355","U","U","U","\232","Y","T","\341",
139 "\205","\240","\203","a","\204","\206","\221","\207",
140 "\212","\202","\210","\211","\215","\241","\214","\213",
141 "d","\244","\225","\242","\223","o","\224","\366",
142 "\355","\227","\243","\226","\201","y","t","\230"
143 }
144 };
145
146 /*
147 * German tex style to latin1 conversion (by root@aspic, 12/04/93)
148 */
149
/* number of slots in the tex_from table (patterns plus NULL terminator) */
#define TEX_SUBST 16

/*
 * Source of the blanks that convert_tex2iso() appends with strncat();
 * the length of this literal caps how many blanks go into one gap.
 */
#define SPACES " "

/*
 * German TeX sequences that convert_tex2iso() recognizes. The entry at
 * index n is replaced by the string stored at the same index of that
 * function's tex_to[] array, so the order here must not change.
 */
static const char *const tex_from[TEX_SUBST] =
{
	/*  0,  1: a umlaut */
	"\"a", "\\\"a",
	/*  2,  3: o umlaut */
	"\"o", "\\\"o",
	/*  4,  5: u umlaut */
	"\"u", "\\\"u",
	/*  6,  7: A umlaut */
	"\"A", "\\\"A",
	/*  8,  9: O umlaut */
	"\"O", "\\\"O",
	/* 10, 11: U umlaut */
	"\"U", "\\\"U",
	/* 12, 13, 14: sharp s */
	"\"s", "\\\"s", "\\3",
	/* 15: end marker */
	NULL
};
164
165 /*
166 * Now the conversion function...
167 */
168
169 void
170 convert_iso2asc(
171 char *iso,
172 char **asc_buffer,
173 size_t *max_line_len,
174 int t)
175 {
176 constext *p;
177 constext *const *tab;
178 char *asc;
179 t_bool first; /* flag for first SPACE/TAB after other characters */
180 int i, a; /* column counters in iso and asc */
181
182 asc = *asc_buffer;
183
184 if (iso == NULL || asc == NULL)
185 return;
186
187 tab = iso2asc[t];
188 first = TRUE;
189 i = a = 0;
190 while (*iso != '\0') {
191 if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
192 p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
193 iso++;
194 i++;
195 first = TRUE;
196 while (*p) {
197 *(asc++) = *(p++);
198 if ((asc - *asc_buffer) >= (int) *max_line_len) {
199 int offset = (int) (asc - *asc_buffer);
200 *max_line_len += 64;
201 *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
202 asc = *asc_buffer + offset;
203 }
204 a++;
205 }
206 } else {
207 if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
208 /*
209 * spaces or TABS should be removed
210 */
211 if (*iso == ' ') {
212 /*
213 * only the first space after a letter must not be removed
214 */
215 if (first) {
216 *(asc++) = ' ';
217 a++;
218 first = FALSE;
219 }
220 i++;
221 } else { /* here: *iso == '\t' */
222 if (a >= TABSTOP(i)) {
223 /*
224 * remove TAB or replace it with SPACE if necessary
225 */
226 if (first) {
227 *(asc++) = ' ';
228 a++;
229 first = FALSE;
230 }
231 } else {
232 /*
233 * TAB will correct the column difference
234 */
235 *(asc++) = '\t'; /* = *iso */
236 a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
237 }
238 i = TABSTOP(i);
239 }
240 iso++;
241 } else {
242 /*
243 * just copy the characters and advance the column counters
244 */
245 if (*iso == '\t') {
246 a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
247 } else if (*iso == '\b') {
248 a--;
249 i--;
250 } else {
251 a++;
252 i++;
253 }
254 *(asc++) = *(iso++);
255 first = TRUE;
256 }
257 }
258 if ((asc - *asc_buffer) >= (int) *max_line_len) {
259 int offset = (int) (asc - *asc_buffer);
260 *max_line_len += 64;
261 *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
262 asc = *asc_buffer + offset;
263 }
264 }
265 *asc = '\0';
266 }
267
268
269 void
270 convert_tex2iso(
271 char *from,
272 char *to)
273 {
274 const char *tex_to[TEX_SUBST];
275 int i;
276 size_t spaces = 0; /* spaces to add */
277 size_t len, col = 0; /* length of from, col counter */
278 size_t subst_len;
279 t_bool ex;
280
281 /* initialize tex_to */
282 memset(tex_to, '\0', sizeof(tex_to));
283
284 /*
285 * Charsets which have German umlauts incl. sharp s at the same
286 * code position as ISO-8859-1
287 * DEC-MCS, Windows-1252
288 */
289 if (IS_LOCAL_CHARSET("ISO-8859-1") ||
290 IS_LOCAL_CHARSET("ISO-8859-2") ||
291 IS_LOCAL_CHARSET("ISO-8859-3") ||
292 IS_LOCAL_CHARSET("ISO-8859-4") ||
293 IS_LOCAL_CHARSET("ISO-8859-9") ||
294 IS_LOCAL_CHARSET("ISO-8859-10") ||
295 IS_LOCAL_CHARSET("ISO-8859-13") ||
296 IS_LOCAL_CHARSET("ISO-8859-14") ||
297 IS_LOCAL_CHARSET("ISO-8859-15") ||
298 IS_LOCAL_CHARSET("ISO-8859-16") ||
299 iso2asc_supported >= 0) {
300 tex_to[1] = tex_to[0] = "\344"; /* auml */
301 tex_to[3] = tex_to[2] = "\366"; /* ouml */
302 tex_to[5] = tex_to[4] = "\374"; /* uuml */
303 tex_to[7] = tex_to[6] = "\304"; /* Auml */
304 tex_to[9] = tex_to[8] = "\326"; /* Ouml */
305 tex_to[11] = tex_to[10] = "\334"; /* Uuml */
306 tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
307 } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
308 tex_to[1] = tex_to[0] = "\303\244"; /* auml */
309 tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
310 tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
311 tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
312 tex_to[9] = tex_to[8] = "\303\226"; /* Ouml */
313 tex_to[11] = tex_to[10] = "\303\234"; /* Uuml */
314 tex_to[14] = tex_to[13] = tex_to[12] = "\303\237"; /* szlig */
315 } else {
316 strcpy(to, from);
317 return;
318 }
319
320 *to = '\0';
321 len = strlen(from);
322
323 while (col < len) {
324 i = 0;
325 ex = FALSE;
326 while ((i < TEX_SUBST - 1) && !ex) {
327 subst_len = strlen(tex_from[i]);
328 if (!strncmp(from + col, tex_from[i], subst_len)) {
329 strcat(to, tex_to[i]);
330 spaces += subst_len - strlen(tex_to[i]);
331 col += subst_len - 1;
332 ex = TRUE;
333 }
334 i++;
335 }
336 if (!ex)
337 strncat(to, from + col, 1);
338 if (from[col] == ' ') {
339 strncat(to, SPACES, spaces);
340 spaces = 0;
341 }
342
343 col++;
344 }
345 }
346
347
348 /*
349 * Check for German TeX encoding in file open on fp
350 */
351 t_bool
352 is_art_tex_encoded(
353 FILE *fp)
354 {
355 char line[LEN];
356 int i, len;
357 t_bool body = FALSE;
358
359 rewind(fp);
360
361 while (fgets(line, (int) sizeof(line), fp) != NULL) {
362 if (line[0] == '\n' && !body)
363 body = TRUE;
364 else if (!body)
365 continue;
366
367 i = 0;
368
369 while (line[i++] == ' ')
370 ; /* search for first non blank */
371
372 i--;
373
374 if (!isalnum((unsigned char) line[i]) && line[i] != '\"')
375 continue; /* quoting char */
376
377 len = (int) strlen(line) - 1;
378 for (i = 1; i < len; i++) {
379 if (((line[i] == '\\') || (line[i] == '\"')) &&
380 (isalnum((unsigned char) line[i - 1])) &&
381 (isalnum((unsigned char) line[i + 1])))
382 return TRUE;
383 }
384 }
385
386 return FALSE;
387 }
388
389
390 /*
391 * Replace all non printable characters by '?'
392 */
393 char *
394 convert_to_printable(
395 char *buf,
396 t_bool keep_tab)
397 {
398 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
399 char *buffer;
400 wchar_t *wbuffer;
401 size_t len = strlen(buf) + 1;
402
403 if (IS_LOCAL_CHARSET("UTF-8"))
404 utf8_valid(buf);
405
406 if ((wbuffer = char2wchar_t(buf)) != NULL) {
407 wconvert_to_printable(wbuffer, keep_tab);
408
409 if ((buffer = wchar_t2char(wbuffer)) != NULL) {
410 strncpy(buf, buffer, len);
411 buf[len - 1] = '\0';
412
413 free(buffer);
414 }
415 free(wbuffer);
416 }
417 #else
418 unsigned char *c;
419
420 for (c = (unsigned char *) buf; *c; c++) {
421 if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
422 *c = '?';
423 }
424 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
425 return buf;
426 }
427
428
#if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
/*
 * Wide character version: replace every character of wbuf for which
 * iswprint() fails by '?'. With keep_tab set, TABs are left alone.
 * wbuf is changed in place and returned.
 *
 * NOTES: don't make the cursor a wint_t as libutf8 (at least version 0.8)
 *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
 *        and thus loop termination fails.
 */
wchar_t *
wconvert_to_printable(
	wchar_t *wbuf,
	t_bool keep_tab)
{
	wchar_t *cur = wbuf;

	while (*cur) {
		if (!(iswprint((wint_t) *cur) || (keep_tab && *cur == (wchar_t) '\t')))
			*cur = (wchar_t) '?';
		cur++;
	}

	return wbuf;
}
#endif /* MULTIBYTE_ABLE && !NO_LOCALE */