"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.6.2/src/charset.c" (9 Dec 2022, 12931 Bytes) of package /linux/misc/tin-2.6.2.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.6.1_vs_2.6.2.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : charset.c
    4  *  Author    : M. Kuhn, T. Burmester
    5  *  Created   : 1993-12-10
    6  *  Updated   : 2021-02-23
    7  *  Notes     : ISO to ascii charset conversion routines
    8  *
    9  * Copyright (c) 1993-2023 Markus Kuhn <mgk25@cl.cam.ac.uk>
   10  * All rights reserved.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  *
   16  * 1. Redistributions of source code must retain the above copyright notice,
   17  *    this list of conditions and the following disclaimer.
   18  *
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  *
   23  * 3. Neither the name of the copyright holder nor the names of its
   24  *    contributors may be used to endorse or promote products derived from
   25  *    this software without specific prior written permission.
   26  *
   27  * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   28  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   31  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37  * POSSIBILITY OF SUCH DAMAGE.
   38  */
   39 
   40 
   41 #ifndef TIN_H
   42 #   include "tin.h"
   43 #endif /* !TIN_H */
   44 
   45 /*
   46  *  Table for the iso2asc conversion
   47  *  iso2asc  by  (unrza3@cd4680fs.rrze.uni-erlangen.de)
   48  *  included by  (root@aspic.han.de)
   49  */
   50 
/* placeholder string used in the tables below where no substitute is given */
#define SUB "?"
#define ISO_EXTRA   0xa0 /* beginning of second range of printable chars */
   53 
   54 /*
   55  * TABSTOP(x) is the column of the character after the TAB
   56  * at column x. First column is 0, of course.
   57  */
   58 
   59 #define TABSTOP(x)  (((x) - ((x)&7)) + 8)
   60 
/*
 * Substitution tables, one per conversion style. The first index is the
 * table number (parameter t of convert_iso2asc()), the second index is
 * the character code minus ISO_EXTRA, i.e. each table holds one
 * replacement string for every code from ISO_EXTRA up to 255, laid out
 * in rows of 16 (or 8) codes.
 */
static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] =
{
    /* 0: universal table for many languages */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
    "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
    },
    /* 1: single-spacing universal table (every replacement is one character) */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
    " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
    "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
    "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
    },
    /* 2: table for Danish, Dutch, German, Norwegian and Swedish */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
    "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
    },
    /* 3: table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant */
    {
    " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
    "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
    "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
    "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
    },
    /* 4: table with RFC1345 codes in brackets */
    {
    "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
    "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
    "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
    "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
    "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
    "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
    "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
    "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
    "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
    "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
    "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
    "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
    },
    /*
     * 5: table for printers that allow overstriking with backspace
     * (replacements contain '\b' between the base character and the accent)
     */
    {
    " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
    "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
    " ","+\b_","2","3","'","u","P",".",
    ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
    "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
    "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
    "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
    "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
    "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
    "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
    "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
    "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
    },
    /*
     * 6: table for IBM PC character set (code page 437)
     * (octal escapes are the target code points in that code page)
     */
    {
    "\377","\255","\233","\234",SUB,"\235","|","\25",
    "\"","(c)","\246","\256","\252","-","(R)","-",
    "\370","\361","\375","3","'","\346","\24","\371",
    ",","1","\247","\257","\254","\253"," 3/4","\250",
    "A","A","A","A","\216","\217","\222","\200",
    "E","\220","E","E","I","I","I","I",
    "D","\245","O","O","O","O","\231","x",
    "\355","U","U","U","\232","Y","T","\341",
    "\205","\240","\203","a","\204","\206","\221","\207",
    "\212","\202","\210","\211","\215","\241","\214","\213",
    "d","\244","\225","\242","\223","o","\224","\366",
    "\355","\227","\243","\226","\201","y","t","\230"
    }
};
  145 
  146 /*
  147  * German tex style to latin1 conversion (by root@aspic, 12/04/93)
  148  */
  149 
/* number of slots in tex_from[] and in the tex_to[] array of convert_tex2iso() */
#define TEX_SUBST   16
/* run of blanks used by convert_tex2iso() as the source for column padding */
#define SPACES      "                                                                                                         "
  152 
/*
 * TeX-style input sequences recognised by convert_tex2iso(). Entry i is
 * replaced by tex_to[i], which that function fills in per local charset.
 * Each umlaut appears as a pair: the bare form ("a) followed by the
 * backslash form (\"a); sharp s additionally has the form \3.
 * The final NULL slot only pads the array to TEX_SUBST entries; the
 * search loop in convert_tex2iso() stops at TEX_SUBST - 1 and never
 * looks at it.
 */
static const char *const tex_from[TEX_SUBST] =
{
    "\"a", "\\\"a",
    "\"o", "\\\"o",
    "\"u", "\\\"u",
    "\"A", "\\\"A",
    "\"O", "\\\"O",
    "\"U", "\\\"U",
    "\"s", "\\\"s", "\\3",
    NULL
};
  164 
  165 /*
  166  *  Now the conversion function...
  167  */
  168 
  169 void
  170 convert_iso2asc(
  171     char *iso,
  172     char **asc_buffer,
  173     size_t *max_line_len,
  174     int t)
  175 {
  176     constext *p;
  177     constext *const *tab;
  178     char *asc;
  179     t_bool first;   /* flag for first SPACE/TAB after other characters */
  180     int i, a;   /* column counters in iso and asc */
  181 
  182     asc = *asc_buffer;
  183 
  184     if (iso == NULL || asc == NULL)
  185         return;
  186 
  187     tab = iso2asc[t];
  188     first = TRUE;
  189     i = a = 0;
  190     while (*iso != '\0') {
  191         if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
  192             p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
  193             iso++;
  194             i++;
  195             first = TRUE;
  196             while (*p) {
  197                 *(asc++) = *(p++);
  198                 if ((asc - *asc_buffer) >= (int) *max_line_len) {
  199                     int offset = (int) (asc - *asc_buffer);
  200                     *max_line_len += 64;
  201                     *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  202                     asc = *asc_buffer + offset;
  203                 }
  204                 a++;
  205             }
  206         } else {
  207             if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
  208                 /*
  209                  * spaces or TABS should be removed
  210                  */
  211                 if (*iso == ' ') {
  212                     /*
  213                      * only the first space after a letter must not be removed
  214                      */
  215                     if (first) {
  216                         *(asc++) = ' ';
  217                         a++;
  218                         first = FALSE;
  219                     }
  220                     i++;
  221                 } else {    /* here: *iso == '\t' */
  222                     if (a >= TABSTOP(i)) {
  223                         /*
  224                          * remove TAB or replace it with SPACE if necessary
  225                          */
  226                         if (first) {
  227                             *(asc++) = ' ';
  228                             a++;
  229                             first = FALSE;
  230                         }
  231                     } else {
  232                         /*
  233                          * TAB will correct the column difference
  234                          */
  235                         *(asc++) = '\t';    /* = *iso */
  236                         a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
  237                     }
  238                     i = TABSTOP(i);
  239                 }
  240                 iso++;
  241             } else {
  242                 /*
  243                  * just copy the characters and advance the column counters
  244                  */
  245                 if (*iso == '\t') {
  246                     a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
  247                 } else if (*iso == '\b') {
  248                     a--;
  249                     i--;
  250                 } else {
  251                     a++;
  252                     i++;
  253                 }
  254                 *(asc++) = *(iso++);
  255                 first = TRUE;
  256             }
  257         }
  258         if ((asc - *asc_buffer) >= (int) *max_line_len) {
  259             int offset = (int) (asc - *asc_buffer);
  260             *max_line_len += 64;
  261             *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  262             asc = *asc_buffer + offset;
  263         }
  264     }
  265     *asc = '\0';
  266 }
  267 
  268 
  269 void
  270 convert_tex2iso(
  271     char *from,
  272     char *to)
  273 {
  274     const char *tex_to[TEX_SUBST];
  275     int i;
  276     size_t spaces = 0; /* spaces to add */
  277     size_t len, col = 0;    /* length of from, col counter */
  278     size_t subst_len;
  279     t_bool ex;
  280 
  281     /* initialize tex_to */
  282     memset(tex_to, '\0', sizeof(tex_to));
  283 
  284     /*
  285      * Charsets which have German umlauts incl. sharp s at the same
  286      * code position as ISO-8859-1
  287      * DEC-MCS, Windows-1252
  288      */
  289     if (IS_LOCAL_CHARSET("ISO-8859-1") ||
  290         IS_LOCAL_CHARSET("ISO-8859-2") ||
  291         IS_LOCAL_CHARSET("ISO-8859-3") ||
  292         IS_LOCAL_CHARSET("ISO-8859-4") ||
  293         IS_LOCAL_CHARSET("ISO-8859-9") ||
  294         IS_LOCAL_CHARSET("ISO-8859-10") ||
  295         IS_LOCAL_CHARSET("ISO-8859-13") ||
  296         IS_LOCAL_CHARSET("ISO-8859-14") ||
  297         IS_LOCAL_CHARSET("ISO-8859-15") ||
  298         IS_LOCAL_CHARSET("ISO-8859-16") ||
  299         iso2asc_supported >= 0) {
  300         tex_to[1] = tex_to[0] = "\344"; /* auml */
  301         tex_to[3] = tex_to[2] = "\366"; /* ouml */
  302         tex_to[5] = tex_to[4] = "\374"; /* uuml */
  303         tex_to[7] = tex_to[6] = "\304"; /* Auml */
  304         tex_to[9] = tex_to[8] = "\326"; /* Ouml */
  305         tex_to[11] = tex_to[10] = "\334";   /* Uuml */
  306         tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
  307     } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
  308         tex_to[1] = tex_to[0] = "\303\244"; /* auml */
  309         tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
  310         tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
  311         tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
  312         tex_to[9] = tex_to[8] = "\303\226"; /* Ouml */
  313         tex_to[11] = tex_to[10] = "\303\234";   /* Uuml */
  314         tex_to[14] = tex_to[13] = tex_to[12] = "\303\237";  /* szlig */
  315     } else {
  316         strcpy(to, from);
  317         return;
  318     }
  319 
  320     *to = '\0';
  321     len = strlen(from);
  322 
  323     while (col < len) {
  324         i = 0;
  325         ex = FALSE;
  326         while ((i < TEX_SUBST - 1) && !ex) {
  327             subst_len = strlen(tex_from[i]);
  328             if (!strncmp(from + col, tex_from[i], subst_len)) {
  329                 strcat(to, tex_to[i]);
  330                 spaces += subst_len - strlen(tex_to[i]);
  331                 col += subst_len - 1;
  332                 ex = TRUE;
  333             }
  334             i++;
  335         }
  336         if (!ex)
  337             strncat(to, from + col, 1);
  338         if (from[col] == ' ') {
  339             strncat(to, SPACES, spaces);
  340             spaces = 0;
  341         }
  342 
  343         col++;
  344     }
  345 }
  346 
  347 
  348 /*
  349  * Check for German TeX encoding in file open on fp
  350  */
  351 t_bool
  352 is_art_tex_encoded(
  353     FILE *fp)
  354 {
  355     char line[LEN];
  356     int i, len;
  357     t_bool body = FALSE;
  358 
  359     rewind(fp);
  360 
  361     while (fgets(line, (int) sizeof(line), fp) != NULL) {
  362         if (line[0] == '\n' && !body)
  363             body = TRUE;
  364         else if (!body)
  365             continue;
  366 
  367         i = 0;
  368 
  369         while (line[i++] == ' ')
  370             ;   /* search for first non blank */
  371 
  372         i--;
  373 
  374         if (!isalnum((unsigned char) line[i]) && line[i] != '\"')
  375             continue;   /* quoting char */
  376 
  377         len = (int) strlen(line) - 1;
  378         for (i = 1; i < len; i++) {
  379             if (((line[i] == '\\') || (line[i] == '\"')) &&
  380                             (isalnum((unsigned char) line[i - 1])) &&
  381                             (isalnum((unsigned char) line[i + 1])))
  382                 return TRUE;
  383         }
  384     }
  385 
  386     return FALSE;
  387 }
  388 
  389 
  390 /*
  391  * Replace all non printable characters by '?'
  392  */
  393 char *
  394 convert_to_printable(
  395     char *buf,
  396     t_bool keep_tab)
  397 {
  398 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  399     char *buffer;
  400     wchar_t *wbuffer;
  401     size_t len = strlen(buf) + 1;
  402 
  403     if (IS_LOCAL_CHARSET("UTF-8"))
  404         utf8_valid(buf);
  405 
  406     if ((wbuffer = char2wchar_t(buf)) != NULL) {
  407         wconvert_to_printable(wbuffer, keep_tab);
  408 
  409         if ((buffer = wchar_t2char(wbuffer)) != NULL) {
  410             strncpy(buf, buffer, len);
  411             buf[len - 1] = '\0';
  412 
  413             free(buffer);
  414         }
  415         free(wbuffer);
  416     }
  417 #else
  418     unsigned char *c;
  419 
  420     for (c = (unsigned char *) buf; *c; c++) {
  421         if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
  422             *c = '?';
  423     }
  424 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
  425     return buf;
  426 }
  427 
  428 
  429 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  430 /*
  431  * NOTES: don't make wc a wint_t as libutf8 (at least version 0.8)
  432  *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
  433  *        and thus loop termination fails.
  434  */
  435 wchar_t *
  436 wconvert_to_printable(
  437     wchar_t *wbuf,
  438     t_bool keep_tab)
  439 {
  440     wchar_t *wc;
  441 
  442     for (wc = wbuf; *wc; wc++) {
  443         if (!iswprint((wint_t) *wc) && !(keep_tab && *wc == (wchar_t) '\t'))
  444             *wc = (wchar_t) '?';
  445     }
  446 
  447     return wbuf;
  448 }
  449 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */