"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.5/src/charset.c" (1 Dec 2020, 12887 Bytes) of package /linux/misc/tin-2.4.5.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.4_vs_2.4.5.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : charset.c
    4  *  Author    : M. Kuhn, T. Burmester
    5  *  Created   : 1993-12-10
    6  *  Updated   : 2020-02-26
    7  *  Notes     : ISO to ascii charset conversion routines
    8  *
    9  * Copyright (c) 1993-2021 Markus Kuhn <mgk25@cl.cam.ac.uk>
   10  * All rights reserved.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  *
   16  * 1. Redistributions of source code must retain the above copyright notice,
   17  *    this list of conditions and the following disclaimer.
   18  *
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  *
   23  * 3. Neither the name of the copyright holder nor the names of its
   24  *    contributors may be used to endorse or promote products derived from
   25  *    this software without specific prior written permission.
   26  *
   27  * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   28  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   31  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37  * POSSIBILITY OF SUCH DAMAGE.
   38  */
   39 
   40 
   41 #ifndef TIN_H
   42 #   include "tin.h"
   43 #endif /* !TIN_H */
   44 
   45 /*
   46  *  Table for the iso2asc conversion
   47  *  iso2asc  by  (unrza3@cd4680fs.rrze.uni-erlangen.de)
   48  *  included by  (root@aspic.han.de)
   49  */
   50 
   51 #define SUB "?"
   52 #define ISO_EXTRA   0xa0 /* beginning of second range of printable chars */
   53 
   54 /*
   55  * TABSTOP(x) is the column of the character after the TAB
   56  * at column x. First column is 0, of course.
   57  */
   58 
   59 #define TABSTOP(x)  (((x) - ((x)&7)) + 8)
   60 
   61 static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] = /* iso2asc[table][byte - ISO_EXTRA] is the replacement string for an input byte in ISO_EXTRA..0xff */
   62 {
   63     /* table 0: universal table for many languages (replacements may be longer than one character) */
   64     {
   65     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   66     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   67     "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
   68     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
   69     "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
   70     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
   71     },
   72     /* table 1: single-spacing universal table (every replacement is exactly one character) */
   73     {
   74     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
   75     " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
   76     "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
   77     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
   78     "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
   79     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
   80     },
   81     /* table 2: table for Danish, Dutch, German, Norwegian and Swedish (umlauts become "ae", "oe", "ue", ...) */
   82     {
   83     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   84     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   85     "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
   86     "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
   87     "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
   88     "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
   89     },
   90     /* table 3: table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant (national letters map to [ ] \ { } | etc.) */
   91     {
   92     " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   93     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   94     "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
   95     "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
   96     "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
   97     "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
   98     },
   99     /* table 4: table with RFC1345 codes in brackets (each replacement is four characters wide) */
  100     {
  101     "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
  102     "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
  103     "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
  104     "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
  105     "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
  106     "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
  107     "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
  108     "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
  109     "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
  110     "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
  111     "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
  112     "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
  113     },
  114     /* table 5: table for printers that allow overstriking with backspace (base character, '\b', then the accent) */
  115     {
  116     " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
  117     "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
  118     " ","+\b_","2","3","'","u","P",".",
  119     ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
  120     "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
  121     "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
  122     "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
  123     "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
  124     "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
  125     "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
  126     "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
  127     "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
  128     },
  129     /* table 6: table for IBM PC character set (code page 437); octal escapes are the target code page's byte values */
  130     {
  131     "\377","\255","\233","\234",SUB,"\235","|","\25",
  132     "\"","(c)","\246","\256","\252","-","(R)","-",
  133     "\370","\361","\375","3","'","\346","\24","\371",
  134     ",","1","\247","\257","\254","\253"," 3/4","\250",
  135     "A","A","A","A","\216","\217","\222","\200",
  136     "E","\220","E","E","I","I","I","I",
  137     "D","\245","O","O","O","O","\231","x",
  138     "\355","U","U","U","\232","Y","T","\341",
  139     "\205","\240","\203","a","\204","\206","\221","\207",
  140     "\212","\202","\210","\211","\215","\241","\214","\213",
  141     "d","\244","\225","\242","\223","o","\224","\366",
  142     "\355","\227","\243","\226","\201","y","t","\230"
  143     }
  144 };
  145 
  146 /*
  147  * German TeX-style umlaut notation conversion (by root@aspic, 12/04/93)
  148  */
  149 
  150 #define TEX_SUBST   16 /* number of slots in tex_from[] and in the tex_to[] array of convert_tex2iso() */
  151 #define SPACES      "                                                                                                         "
  152 
  153 static const char *const tex_from[TEX_SUBST] =
  154 {
  155     "\"a","\\\"a","\"o","\\\"o","\"u","\\\"u","\"A","\\\"A","\"O","\\\"O","\"U","\\\"U","\"s","\\\"s","\\3",'\0'
  156 };
  157 
  158 /*
  159  *  Now the conversion function...
  160  */
  161 
  162 void
  163 convert_iso2asc(
  164     char *iso,
  165     char **asc_buffer,
  166     size_t *max_line_len,
  167     int t)
  168 {
  169     constext *p;
  170     constext *const *tab;
  171     char *asc;
  172     t_bool first;   /* flag for first SPACE/TAB after other characters */
  173     int i, a;   /* column counters in iso and asc */
  174 
  175     asc = *asc_buffer;
  176 
  177     if (iso == NULL || asc == NULL)
  178         return;
  179 
  180     tab = iso2asc[t];
  181     first = TRUE;
  182     i = a = 0;
  183     while (*iso != '\0') {
  184         if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
  185             p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
  186             iso++;
  187             i++;
  188             first = TRUE;
  189             while (*p) {
  190                 *(asc++) = *(p++);
  191                 if ((asc - *asc_buffer) >= (int) *max_line_len) {
  192                     int offset = asc - *asc_buffer;
  193                     *max_line_len += 64;
  194                     *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  195                     asc = *asc_buffer + offset;
  196                 }
  197                 a++;
  198             }
  199         } else {
  200             if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
  201                 /*
  202                  * spaces or TABS should be removed
  203                  */
  204                 if (*iso == ' ') {
  205                     /*
  206                      * only the first space after a letter must not be removed
  207                      */
  208                     if (first) {
  209                         *(asc++) = ' ';
  210                         a++;
  211                         first = FALSE;
  212                     }
  213                     i++;
  214                 } else {    /* here: *iso == '\t' */
  215                     if (a >= TABSTOP(i)) {
  216                         /*
  217                          * remove TAB or replace it with SPACE if necessary
  218                          */
  219                         if (first) {
  220                             *(asc++) = ' ';
  221                             a++;
  222                             first = FALSE;
  223                         }
  224                     } else {
  225                         /*
  226                          * TAB will correct the column difference
  227                          */
  228                         *(asc++) = '\t';    /* = *iso */
  229                         a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
  230                     }
  231                     i = TABSTOP(i);
  232                 }
  233                 iso++;
  234             } else {
  235                 /*
  236                  * just copy the characters and advance the column counters
  237                  */
  238                 if (*iso == '\t') {
  239                     a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
  240                 } else if (*iso == '\b') {
  241                     a--;
  242                     i--;
  243                 } else {
  244                     a++;
  245                     i++;
  246                 }
  247                 *(asc++) = *(iso++);
  248                 first = TRUE;
  249             }
  250         }
  251         if ((asc - *asc_buffer) >= (int) *max_line_len) {
  252             int offset = asc - *asc_buffer;
  253             *max_line_len += 64;
  254             *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  255             asc = *asc_buffer + offset;
  256         }
  257     }
  258     *asc = '\0';
  259 }
  260 
  261 
  262 void
  263 convert_tex2iso(
  264     char *from,
  265     char *to)
  266 {
  267     const char *tex_to[TEX_SUBST];
  268     int i;
  269     size_t spaces = 0; /* spaces to add */
  270     size_t len, col = 0;    /* length of from, col counter */
  271     size_t subst_len;
  272     t_bool ex;
  273 
  274     /* initialize tex_to */
  275     memset(tex_to, '\0', sizeof(tex_to));
  276 
  277     /*
  278      * Charsets which have german umlauts incl. sharp s at the same
  279      * code position as ISO-8859-1
  280      * DEC-MCS, Windows-1252
  281      */
  282     if (IS_LOCAL_CHARSET("ISO-8859-1") ||
  283         IS_LOCAL_CHARSET("ISO-8859-2") ||
  284         IS_LOCAL_CHARSET("ISO-8859-3") ||
  285         IS_LOCAL_CHARSET("ISO-8859-4") ||
  286         IS_LOCAL_CHARSET("ISO-8859-9") ||
  287         IS_LOCAL_CHARSET("ISO-8859-10") ||
  288         IS_LOCAL_CHARSET("ISO-8859-13") ||
  289         IS_LOCAL_CHARSET("ISO-8859-14") ||
  290         IS_LOCAL_CHARSET("ISO-8859-15") ||
  291         IS_LOCAL_CHARSET("ISO-8859-16") ||
  292         iso2asc_supported >= 0) {
  293         tex_to[1] = tex_to[0] = "\344"; /* auml */
  294         tex_to[3] = tex_to[2] = "\366"; /* ouml */
  295         tex_to[5] = tex_to[4] = "\374"; /* uuml */
  296         tex_to[7] = tex_to[6] = "\304"; /* Auml */
  297         tex_to[9] = tex_to[8] = "\326"; /* Ouml */
  298         tex_to[11] = tex_to[10] = "\334";   /* Uuml */
  299         tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
  300     } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
  301         tex_to[1] = tex_to[0] = "\303\244"; /* auml */
  302         tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
  303         tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
  304         tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
  305         tex_to[9] = tex_to[8] = "\303\226"; /* Ouml */
  306         tex_to[11] = tex_to[10] = "\303\234";   /* Uuml */
  307         tex_to[14] = tex_to[13] = tex_to[12] = "\303\237";  /* szlig */
  308     } else {
  309         strcpy(to, from);
  310         return;
  311     }
  312 
  313     *to = '\0';
  314     len = strlen(from);
  315 
  316     while (col < len) {
  317         i = 0;
  318         ex = FALSE;
  319         while ((i < TEX_SUBST - 1) && !ex) {
  320             subst_len = strlen(tex_from[i]);
  321             if (!strncmp(from + col, tex_from[i], subst_len)) {
  322                 strcat(to, tex_to[i]);
  323                 spaces += subst_len - strlen(tex_to[i]);
  324                 col += subst_len - 1;
  325                 ex = TRUE;
  326             }
  327             i++;
  328         }
  329         if (!ex)
  330             strncat(to, from + col, 1);
  331         if (from[col] == ' ') {
  332             strncat(to, SPACES, spaces);
  333             spaces = 0;
  334         }
  335 
  336         col++;
  337     }
  338 }
  339 
  340 
  341 /*
  342  * Check for german TeX encoding in file open on fp
  343  */
  344 t_bool
  345 is_art_tex_encoded(
  346     FILE *fp)
  347 {
  348     char line[LEN];
  349     int i, len;
  350     t_bool body = FALSE;
  351 
  352     rewind(fp);
  353 
  354     while (fgets(line, (int) sizeof(line), fp) != NULL) {
  355         if (line[0] == '\n' && !body)
  356             body = TRUE;
  357         else if (!body)
  358             continue;
  359 
  360         i = 0;
  361 
  362         while (line[i++] == ' ')
  363             ;   /* search for first non blank */
  364 
  365         i--;
  366 
  367         if (!isalnum((unsigned char) line[i]) && line[i] != '\"')
  368             continue;   /* quoting char */
  369 
  370         len = strlen(line) - 1;
  371         for (i = 1; i < len; i++) {
  372             if (((line[i] == '\\') || (line[i] == '\"')) &&
  373                             (isalnum((unsigned char) line[i - 1])) &&
  374                             (isalnum((unsigned char) line[i + 1])))
  375                 return TRUE;
  376         }
  377     }
  378 
  379     return FALSE;
  380 }
  381 
  382 
  383 /*
  384  * Replace all non printable characters by '?'
  385  */
  386 char *
  387 convert_to_printable(
  388     char *buf,
  389     t_bool keep_tab)
  390 {
  391 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  392     char *buffer;
  393     wchar_t *wbuffer;
  394     size_t len = strlen(buf) + 1;
  395 
  396     if (IS_LOCAL_CHARSET("UTF-8"))
  397         utf8_valid(buf);
  398 
  399     if ((wbuffer = char2wchar_t(buf)) != NULL) {
  400         wconvert_to_printable(wbuffer, keep_tab);
  401 
  402         if ((buffer = wchar_t2char(wbuffer)) != NULL) {
  403             strncpy(buf, buffer, len);
  404             buf[len - 1] = '\0';
  405 
  406             free(buffer);
  407         }
  408         free(wbuffer);
  409     }
  410 #else
  411     unsigned char *c;
  412 
  413     for (c = (unsigned char *) buf; *c; c++) {
  414         if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
  415             *c = '?';
  416     }
  417 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
  418     return buf;
  419 }
  420 
  421 
  422 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  423 /*
  424  * NOTES: don't make wc a wint_t as libutf8 (at least version 0.8)
  425  *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
  426  *        and thus loop termination fails.
  427  */
  428 wchar_t *
  429 wconvert_to_printable(
  430     wchar_t *wbuf,
  431     t_bool keep_tab)
  432 {
  433     wchar_t *wc;
  434 
  435     for (wc = wbuf; *wc; wc++) {
  436         if (!iswprint((wint_t) *wc) && !(keep_tab && *wc == (wchar_t) '\t'))
  437             *wc = (wchar_t) '?';
  438     }
  439 
  440     return wbuf;
  441 }
  442 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */