tin  2.6.1
About: TIN is a threaded NNTP and spool based UseNet newsreader.
  Fossies Dox: tin-2.6.1.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

charset.c
Go to the documentation of this file.
1/*
2 * Project : tin - a Usenet reader
3 * Module : charset.c
4 * Author : M. Kuhn, T. Burmester
5 * Created : 1993-12-10
6 * Updated : 2021-02-23
7 * Notes : ISO to ascii charset conversion routines
8 *
9 * Copyright (c) 1993-2022 Markus Kuhn <mgk25@cl.cam.ac.uk>
10 * All rights reserved.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 *
16 * 1. Redistributions of source code must retain the above copyright notice,
17 * this list of conditions and the following disclaimer.
18 *
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * 3. Neither the name of the copyright holder nor the names of its
24 * contributors may be used to endorse or promote products derived from
25 * this software without specific prior written permission.
26 *
27 * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
31 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40
41#ifndef TIN_H
42# include "tin.h"
43#endif /* !TIN_H */
44
45/*
46 * Table for the iso2asc conversion
47 * iso2asc by (unrza3@cd4680fs.rrze.uni-erlangen.de)
48 * included by (root@aspic.han.de)
49 */
50
/* replacement text for characters that have no rendering in a table */
#define SUB "?"
/* beginning of second range of printable chars */
#define ISO_EXTRA 0xa0

/*
 * TABSTOP(x) yields the column of the character following a TAB that
 * is located at column x. Columns are counted from 0 and tab stops
 * are 8 columns apart. The argument is evaluated exactly once.
 */
#define TABSTOP(x) (((x) | 7) + 1)
60
62{
63 /* universal table for many languages */
64 {
65 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
66 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
67 "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
68 "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
69 "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
70 "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
71 },
72 /* single-spacing universal table */
73 {
74 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
75 " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
76 "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
77 "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
78 "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
79 "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
80 },
81 /* table for Danish, Dutch, German, Norwegian and Swedish */
82 {
83 " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
84 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
85 "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
86 "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
87 "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
88 "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
89 },
90 /* table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant */
91 {
92 " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
93 " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
94 "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
95 "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
96 "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
97 "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
98 },
99 /* table with RFC1345 codes in brackets */
100 {
101 "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
102 "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
103 "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
104 "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
105 "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
106 "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
107 "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
108 "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
109 "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
110 "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
111 "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
112 "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
113 },
114 /* table for printers that allow overstriking with backspace */
115 {
116 " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
117 "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
118 " ","+\b_","2","3","'","u","P",".",
119 ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
120 "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
121 "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
122 "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
123 "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
124 "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
125 "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
126 "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
127 "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
128 },
129 /* table for IBM PC character set (code page 437) */
130 {
131 "\377","\255","\233","\234",SUB,"\235","|","\25",
132 "\"","(c)","\246","\256","\252","-","(R)","-",
133 "\370","\361","\375","3","'","\346","\24","\371",
134 ",","1","\247","\257","\254","\253"," 3/4","\250",
135 "A","A","A","A","\216","\217","\222","\200",
136 "E","\220","E","E","I","I","I","I",
137 "D","\245","O","O","O","O","\231","x",
138 "\355","U","U","U","\232","Y","T","\341",
139 "\205","\240","\203","a","\204","\206","\221","\207",
140 "\212","\202","\210","\211","\215","\241","\214","\213",
141 "d","\244","\225","\242","\223","o","\224","\366",
142 "\355","\227","\243","\226","\201","y","t","\230"
143 }
144};
145
146/*
147 * German tex style to latin1 conversion (by root@aspic, 12/04/93)
148 */
149
/* number of slots in tex_from[] (the last one is a NULL sentinel) */
#define TEX_SUBST 16
/*
 * Source of the padding blanks appended by convert_tex2iso(); its length
 * bounds how many blanks can be appended in one go.
 * NOTE(review): the literal may have been whitespace-collapsed when this
 * listing was extracted - confirm against the pristine source.
 */
#define SPACES " "

/*
 * German TeX sequences recognized by convert_tex2iso(). Entry n is
 * replaced by entry n of the tex_to[] array that function builds.
 */
static const char *const tex_from[TEX_SUBST] =
{
	/*  0,  1 */ "\"a", "\\\"a",
	/*  2,  3 */ "\"o", "\\\"o",
	/*  4,  5 */ "\"u", "\\\"u",
	/*  6,  7 */ "\"A", "\\\"A",
	/*  8,  9 */ "\"O", "\\\"O",
	/* 10, 11 */ "\"U", "\\\"U",
	/* 12..14 */ "\"s", "\\\"s", "\\3",
	/* 15     */ NULL
};
164
165/*
166 * Now the conversion function...
167 */
168
169void
171 char *iso,
172 char **asc_buffer,
173 size_t *max_line_len,
174 int t)
175{
176 constext *p;
177 constext *const *tab;
178 char *asc;
179 t_bool first; /* flag for first SPACE/TAB after other characters */
180 int i, a; /* column counters in iso and asc */
181
182 asc = *asc_buffer;
183
184 if (iso == NULL || asc == NULL)
185 return;
186
187 tab = iso2asc[t];
188 first = TRUE;
189 i = a = 0;
190 while (*iso != '\0') {
191 if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
192 p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
193 iso++;
194 i++;
195 first = TRUE;
196 while (*p) {
197 *(asc++) = *(p++);
198 if ((asc - *asc_buffer) >= (int) *max_line_len) {
199 int offset = (int) (asc - *asc_buffer);
200 *max_line_len += 64;
201 *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
202 asc = *asc_buffer + offset;
203 }
204 a++;
205 }
206 } else {
207 if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
208 /*
209 * spaces or TABS should be removed
210 */
211 if (*iso == ' ') {
212 /*
213 * only the first space after a letter must not be removed
214 */
215 if (first) {
216 *(asc++) = ' ';
217 a++;
218 first = FALSE;
219 }
220 i++;
221 } else { /* here: *iso == '\t' */
222 if (a >= TABSTOP(i)) {
223 /*
224 * remove TAB or replace it with SPACE if necessary
225 */
226 if (first) {
227 *(asc++) = ' ';
228 a++;
229 first = FALSE;
230 }
231 } else {
232 /*
233 * TAB will correct the column difference
234 */
235 *(asc++) = '\t'; /* = *iso */
236 a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
237 }
238 i = TABSTOP(i);
239 }
240 iso++;
241 } else {
242 /*
243 * just copy the characters and advance the column counters
244 */
245 if (*iso == '\t') {
246 a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
247 } else if (*iso == '\b') {
248 a--;
249 i--;
250 } else {
251 a++;
252 i++;
253 }
254 *(asc++) = *(iso++);
255 first = TRUE;
256 }
257 }
258 if ((asc - *asc_buffer) >= (int) *max_line_len) {
259 int offset = (int) (asc - *asc_buffer);
260 *max_line_len += 64;
261 *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
262 asc = *asc_buffer + offset;
263 }
264 }
265 *asc = '\0';
266}
267
268
269void
271 char *from,
272 char *to)
273{
274 const char *tex_to[TEX_SUBST];
275 int i;
276 size_t spaces = 0; /* spaces to add */
277 size_t len, col = 0; /* length of from, col counter */
278 size_t subst_len;
279 t_bool ex;
280
281 /* initialize tex_to */
282 memset(tex_to, '\0', sizeof(tex_to));
283
284 /*
285 * Charsets which have German umlauts incl. sharp s at the same
286 * code position as ISO-8859-1
287 * DEC-MCS, Windows-1252
288 */
289 if (IS_LOCAL_CHARSET("ISO-8859-1") ||
290 IS_LOCAL_CHARSET("ISO-8859-2") ||
291 IS_LOCAL_CHARSET("ISO-8859-3") ||
292 IS_LOCAL_CHARSET("ISO-8859-4") ||
293 IS_LOCAL_CHARSET("ISO-8859-9") ||
294 IS_LOCAL_CHARSET("ISO-8859-10") ||
295 IS_LOCAL_CHARSET("ISO-8859-13") ||
296 IS_LOCAL_CHARSET("ISO-8859-14") ||
297 IS_LOCAL_CHARSET("ISO-8859-15") ||
298 IS_LOCAL_CHARSET("ISO-8859-16") ||
299 iso2asc_supported >= 0) {
300 tex_to[1] = tex_to[0] = "\344"; /* auml */
301 tex_to[3] = tex_to[2] = "\366"; /* ouml */
302 tex_to[5] = tex_to[4] = "\374"; /* uuml */
303 tex_to[7] = tex_to[6] = "\304"; /* Auml */
304 tex_to[9] = tex_to[8] = "\326"; /* Ouml */
305 tex_to[11] = tex_to[10] = "\334"; /* Uuml */
306 tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
307 } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
308 tex_to[1] = tex_to[0] = "\303\244"; /* auml */
309 tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
310 tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
311 tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
312 tex_to[9] = tex_to[8] = "\303\226"; /* Ouml */
313 tex_to[11] = tex_to[10] = "\303\234"; /* Uuml */
314 tex_to[14] = tex_to[13] = tex_to[12] = "\303\237"; /* szlig */
315 } else {
316 strcpy(to, from);
317 return;
318 }
319
320 *to = '\0';
321 len = strlen(from);
322
323 while (col < len) {
324 i = 0;
325 ex = FALSE;
326 while ((i < TEX_SUBST - 1) && !ex) {
327 subst_len = strlen(tex_from[i]);
328 if (!strncmp(from + col, tex_from[i], subst_len)) {
329 strcat(to, tex_to[i]);
330 spaces += subst_len - strlen(tex_to[i]);
331 col += subst_len - 1;
332 ex = TRUE;
333 }
334 i++;
335 }
336 if (!ex)
337 strncat(to, from + col, 1);
338 if (from[col] == ' ') {
339 strncat(to, SPACES, spaces);
340 spaces = 0;
341 }
342
343 col++;
344 }
345}
346
347
348/*
349 * Check for German TeX encoding in file open on fp
350 */
351t_bool
353 FILE *fp)
354{
355 char line[LEN];
356 int i, len;
357 t_bool body = FALSE;
358
359 rewind(fp);
360
361 while (fgets(line, (int) sizeof(line), fp) != NULL) {
362 if (line[0] == '\n' && !body)
363 body = TRUE;
364 else if (!body)
365 continue;
366
367 i = 0;
368
369 while (line[i++] == ' ')
370 ; /* search for first non blank */
371
372 i--;
373
374 if (!isalnum((unsigned char) line[i]) && line[i] != '\"')
375 continue; /* quoting char */
376
377 len = (int) strlen(line) - 1;
378 for (i = 1; i < len; i++) {
379 if (((line[i] == '\\') || (line[i] == '\"')) &&
380 (isalnum((unsigned char) line[i - 1])) &&
381 (isalnum((unsigned char) line[i + 1])))
382 return TRUE;
383 }
384 }
385
386 return FALSE;
387}
388
389
390/*
391 * Replace all non printable characters by '?'
392 */
393char *
395 char *buf,
396 t_bool keep_tab)
397{
398#if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
399 char *buffer;
400 wchar_t *wbuffer;
401 size_t len = strlen(buf) + 1;
402
403 if (IS_LOCAL_CHARSET("UTF-8"))
404 utf8_valid(buf);
405
406 if ((wbuffer = char2wchar_t(buf)) != NULL) {
407 wconvert_to_printable(wbuffer, keep_tab);
408
409 if ((buffer = wchar_t2char(wbuffer)) != NULL) {
410 strncpy(buf, buffer, len);
411 buf[len - 1] = '\0';
412
413 free(buffer);
414 }
415 free(wbuffer);
416 }
417#else
418 unsigned char *c;
419
420 for (c = (unsigned char *) buf; *c; c++) {
421 if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
422 *c = '?';
423 }
424#endif /* MULTIBYTE_ABLE && !NO_LOCALE */
425 return buf;
426}
427
428
#if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
/*
 * Wide character counterpart of convert_to_printable(): every character
 * of wbuf for which iswprint() fails is overwritten with '?', except
 * for TABs when keep_tab is set. wbuf is modified in place and returned.
 *
 * NOTES: don't make the cursor a wint_t as libutf8 (at least version 0.8)
 *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
 *        and thus loop termination fails.
 */
wchar_t *
wconvert_to_printable(
	wchar_t *wbuf,
	t_bool keep_tab)
{
	wchar_t *cur = wbuf;

	while (*cur) {
		if (!(iswprint((wint_t) *cur) || (keep_tab && *cur == (wchar_t) '\t')))
			*cur = (wchar_t) '?';
		cur++;
	}

	return wbuf;
}
#endif /* MULTIBYTE_ABLE && !NO_LOCALE */
unsigned t_bool
Definition: bool.h:77
#define TRUE
Definition: bool.h:74
#define FALSE
Definition: bool.h:70
t_bool is_art_tex_encoded(FILE *fp)
Definition: charset.c:352
#define SPACES
Definition: charset.c:151
void convert_iso2asc(char *iso, char **asc_buffer, size_t *max_line_len, int t)
Definition: charset.c:170
static const char *const tex_from[16]
Definition: charset.c:153
#define SUB
Definition: charset.c:51
#define ISO_EXTRA
Definition: charset.c:52
#define TEX_SUBST
Definition: charset.c:150
char * convert_to_printable(char *buf, t_bool keep_tab)
Definition: charset.c:394
static constext *const iso2asc[7][256- 0xa0]
Definition: charset.c:61
#define TABSTOP(x)
Definition: charset.c:59
void convert_tex2iso(char *from, char *to)
Definition: charset.c:270
int iso2asc_supported
Definition: init.c:121
static char buf[16]
Definition: langinfo.c:50
int my_isprint(int c)
Definition: misc.c:978
static int offset
Definition: read.c:62
#define LEN
Definition: tin.h:860
#define NUM_ISO_TABLES
Definition: tin.h:998
#define IS_LOCAL_CHARSET(c)
Definition: tin.h:782
const char constext
Definition: tin.h:2018
#define my_realloc(ptr, size)
Definition: tin.h:2247
#define EIGHT_BIT(ptr)
Definition: tin.h:2267