"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.4/src/charset.c" (20 Nov 2019, 12838 Bytes) of package /linux/misc/tin-2.4.4.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.3_vs_2.4.4.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : charset.c
    4  *  Author    : M. Kuhn, T. Burmester
    5  *  Created   : 1993-12-10
    6  *  Updated   : 2019-07-05
    7  *  Notes     : ISO to ascii charset conversion routines
    8  *
    9  * Copyright (c) 1993-2020 Markus Kuhn <mgk25@cl.cam.ac.uk>
   10  * All rights reserved.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  *
   16  * 1. Redistributions of source code must retain the above copyright notice,
   17  *    this list of conditions and the following disclaimer.
   18  *
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  *
   23  * 3. Neither the name of the copyright holder nor the names of its
   24  *    contributors may be used to endorse or promote products derived from
   25  *    this software without specific prior written permission.
   26  *
   27  * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   28  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   31  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37  * POSSIBILITY OF SUCH DAMAGE.
   38  */
   39 
   40 
   41 #ifndef TIN_H
   42 #   include "tin.h"
   43 #endif /* !TIN_H */
   44 
   45 /*
   46  *  Table for the iso2asc conversion
   47  *  iso2asc  by  (unrza3@cd4680fs.rrze.uni-erlangen.de)
   48  *  included by  (root@aspic.han.de)
   49  */
   50 
   51 #define SUB "?"
   52 #define ISO_EXTRA   0xa0 /* beginning of second range of printable chars */
   53 
   54 /*
   55  * TABSTOP(x) is the column of the character after the TAB
   56  * at column x. First column is 0, of course.
   57  */
   58 
   59 #define TABSTOP(x)  (((x) - ((x)&7)) + 8)
   60 
/*
 * Replacement strings for the character codes ISO_EXTRA (0xa0) .. 0xff,
 * one row of 256-ISO_EXTRA entries per conversion table.
 * convert_iso2asc() picks the row with its table number and indexes it
 * with (character code - ISO_EXTRA). An entry may be longer than one
 * character, or contain backspaces for overstriking, so a converted line
 * can occupy more columns than its source. SUB marks characters for
 * which a table offers no replacement.
 */
static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] =
{
    /* universal table for many languages */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
    "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
    },
    /* single-spacing universal table */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
    " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
    "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
    "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
    },
    /* table for Danish, Dutch, German, Norwegian and Swedish */
    {
    " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
    "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
    "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
    "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
    },
    /* table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant */
    {
    " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
    " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
    "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
    "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
    "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
    "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
    },
    /* table with RFC1345 codes in brackets */
    {
    "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
    "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
    "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
    "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
    "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
    "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
    "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
    "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
    "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
    "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
    "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
    "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
    },
    /* table for printers that allow overstriking with backspace */
    {
    " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
    "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
    " ","+\b_","2","3","'","u","P",".",
    ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
    "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
    "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
    "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
    "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
    "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
    "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
    "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
    "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
    },
    /* table for IBM PC character set (code page 437) */
    {
    "\377","\255","\233","\234",SUB,"\235","|","\25",
    "\"","(c)","\246","\256","\252","-","(R)","-",
    "\370","\361","\375","3","'","\346","\24","\371",
    ",","1","\247","\257","\254","\253"," 3/4","\250",
    "A","A","A","A","\216","\217","\222","\200",
    "E","\220","E","E","I","I","I","I",
    "D","\245","O","O","O","O","\231","x",
    "\355","U","U","U","\232","Y","T","\341",
    "\205","\240","\203","a","\204","\206","\221","\207",
    "\212","\202","\210","\211","\215","\241","\214","\213",
    "d","\244","\225","\242","\223","o","\224","\366",
    "\355","\227","\243","\226","\201","y","t","\230"
    }
};
  145 
/*
 * German TeX style to Latin-1 conversion (by root@aspic, 12/04/93)
 */

/* number of TeX sequences in tex_from[] (and of replacements in tex_to[]) */
#define TEX_SUBST   15
  151 #define SPACES      "                                                                                                         "
  152 
/*
 * TeX input sequences recognized by convert_tex2iso(); entry n is replaced
 * by tex_to[n] there. The umlauts come in pairs (plain form "x, then the
 * backslash form \"x) in the order a, o, u, A, O, U; the last three
 * entries ("s, \"s and \3) all stand for the sharp s.
 */
static const char *const tex_from[TEX_SUBST] =
{
    "\"a","\\\"a","\"o","\\\"o","\"u","\\\"u","\"A","\\\"A","\"O","\\\"O","\"U","\\\"U","\"s","\\\"s","\\3"
};
  157 
  158 /*
  159  *  Now the conversion function...
  160  */
  161 
  162 void
  163 convert_iso2asc(
  164     char *iso,
  165     char **asc_buffer,
  166     size_t *max_line_len,
  167     int t)
  168 {
  169     constext *p;
  170     constext *const *tab;
  171     char *asc;
  172     t_bool first;   /* flag for first SPACE/TAB after other characters */
  173     int i, a;   /* column counters in iso and asc */
  174 
  175     asc = *asc_buffer;
  176 
  177     if (iso == NULL || asc == NULL)
  178         return;
  179 
  180     tab = iso2asc[t];
  181     first = TRUE;
  182     i = a = 0;
  183     while (*iso != '\0') {
  184         if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
  185             p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
  186             iso++;
  187             i++;
  188             first = TRUE;
  189             while (*p) {
  190                 *(asc++) = *(p++);
  191                 if ((asc - *asc_buffer) >= (int) *max_line_len) {
  192                     int offset = asc - *asc_buffer;
  193                     *max_line_len += 64;
  194                     *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  195                     asc = *asc_buffer + offset;
  196                 }
  197                 a++;
  198             }
  199         } else {
  200             if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
  201                 /*
  202                  * spaces or TABS should be removed
  203                  */
  204                 if (*iso == ' ') {
  205                     /*
  206                      * only the first space after a letter must not be removed
  207                      */
  208                     if (first) {
  209                         *(asc++) = ' ';
  210                         a++;
  211                         first = FALSE;
  212                     }
  213                     i++;
  214                 } else {    /* here: *iso == '\t' */
  215                     if (a >= TABSTOP(i)) {
  216                         /*
  217                          * remove TAB or replace it with SPACE if necessary
  218                          */
  219                         if (first) {
  220                             *(asc++) = ' ';
  221                             a++;
  222                             first = FALSE;
  223                         }
  224                     } else {
  225                         /*
  226                          * TAB will correct the column difference
  227                          */
  228                         *(asc++) = '\t';    /* = *iso */
  229                         a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
  230                     }
  231                     i = TABSTOP(i);
  232                 }
  233                 iso++;
  234             } else {
  235                 /*
  236                  * just copy the characters and advance the column counters
  237                  */
  238                 if (*iso == '\t') {
  239                     a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
  240                 } else if (*iso == '\b') {
  241                     a--;
  242                     i--;
  243                 } else {
  244                     a++;
  245                     i++;
  246                 }
  247                 *(asc++) = *(iso++);
  248                 first = TRUE;
  249             }
  250         }
  251         if ((asc - *asc_buffer) >= (int) *max_line_len) {
  252             int offset = asc - *asc_buffer;
  253             *max_line_len += 64;
  254             *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  255             asc = *asc_buffer + offset;
  256         }
  257     }
  258     *asc = '\0';
  259 }
  260 
  261 
  262 void
  263 convert_tex2iso(
  264     char *from,
  265     char *to)
  266 {
  267     const char *tex_to[TEX_SUBST];
  268     int i;
  269     size_t spaces = 0; /* spaces to add */
  270     size_t len, col = 0;    /* length of from, col counter */
  271     size_t subst_len;
  272     t_bool ex;
  273 
  274     /* initialize tex_to */
  275     /*
  276      * Charsets which have german umlauts incl. sharp s at the same
  277      * code position as ISO-8859-1
  278      * DEC-MCS, Windows-1252
  279      */
  280     if (IS_LOCAL_CHARSET("ISO-8859-1") ||
  281         IS_LOCAL_CHARSET("ISO-8859-2") ||
  282         IS_LOCAL_CHARSET("ISO-8859-3") ||
  283         IS_LOCAL_CHARSET("ISO-8859-4") ||
  284         IS_LOCAL_CHARSET("ISO-8859-9") ||
  285         IS_LOCAL_CHARSET("ISO-8859-10") ||
  286         IS_LOCAL_CHARSET("ISO-8859-13") ||
  287         IS_LOCAL_CHARSET("ISO-8859-14") ||
  288         IS_LOCAL_CHARSET("ISO-8859-15") ||
  289         IS_LOCAL_CHARSET("ISO-8859-16") ||
  290         iso2asc_supported >= 0) {
  291         tex_to[1] = tex_to[0] = "\344"; /* auml */
  292         tex_to[3] = tex_to[2] = "\366"; /* ouml */
  293         tex_to[5] = tex_to[4] = "\374"; /* uuml */
  294         tex_to[7] = tex_to[6] = "\304"; /* Auml */
  295         tex_to[9] = tex_to[8] = "\326"; /* Ouml */
  296         tex_to[11] = tex_to[10] = "\334";   /* Uuml */
  297         tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
  298     } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
  299         tex_to[1] = tex_to[0] = "\303\244"; /* auml */
  300         tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
  301         tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
  302         tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
  303         tex_to[9] = tex_to[8] = "\303\266"; /* Ouml */
  304         tex_to[11] = tex_to[10] = "\303\234";   /* Uuml */
  305         tex_to[14] = tex_to[13] = tex_to[12] = "\303\237";  /* szlig */
  306     } else {
  307         strcpy(to, from);
  308         return;
  309     }
  310 
  311     *to = '\0';
  312     len = strlen(from);
  313 
  314     while (col < len) {
  315         i = 0;
  316         ex = FALSE;
  317         while ((i < TEX_SUBST) && !ex) {
  318             subst_len = strlen(tex_from[i]);
  319             if (!strncmp(from + col, tex_from[i], subst_len)) {
  320                 strcat(to, tex_to[i]);
  321                 spaces += subst_len - strlen(tex_to[i]);
  322                 col += subst_len - 1;
  323                 ex = TRUE;
  324             }
  325             i++;
  326         }
  327         if (!ex)
  328             strncat(to, from + col, 1);
  329         if (from[col] == ' ') {
  330             strncat(to, SPACES, spaces);
  331             spaces = 0;
  332         }
  333 
  334         col++;
  335     }
  336 }
  337 
  338 
  339 /*
  340  * Check for german TeX encoding in file open on fp
  341  */
  342 t_bool
  343 is_art_tex_encoded(
  344     FILE *fp)
  345 {
  346     char line[LEN];
  347     int i, len;
  348     t_bool body = FALSE;
  349 
  350     rewind(fp);
  351 
  352     while (fgets(line, (int) sizeof(line), fp) != NULL) {
  353         if (line[0] == '\n' && !body)
  354             body = TRUE;
  355         else if (!body)
  356             continue;
  357 
  358         i = 0;
  359 
  360         while (line[i++] == ' ')
  361             ;   /* search for first non blank */
  362 
  363         i--;
  364 
  365         if (!isalnum((unsigned char) line[i]) && line[i] != '\"')
  366             continue;   /* quoting char */
  367 
  368         len = strlen(line) - 1;
  369         for (i = 1; i < len; i++) {
  370             if (((line[i] == '\\') || (line[i] == '\"')) &&
  371                             (isalnum((unsigned char) line[i - 1])) &&
  372                             (isalnum((unsigned char) line[i + 1])))
  373                 return TRUE;
  374         }
  375     }
  376 
  377     return FALSE;
  378 }
  379 
  380 
  381 /*
  382  * Replace all non printable characters by '?'
  383  */
  384 char *
  385 convert_to_printable(
  386     char *buf,
  387     t_bool keep_tab)
  388 {
  389 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  390     char *buffer;
  391     wchar_t *wbuffer;
  392     size_t len = strlen(buf) + 1;
  393 
  394     if (IS_LOCAL_CHARSET("UTF-8"))
  395         utf8_valid(buf);
  396 
  397     if ((wbuffer = char2wchar_t(buf)) != NULL) {
  398         wconvert_to_printable(wbuffer, keep_tab);
  399 
  400         if ((buffer = wchar_t2char(wbuffer)) != NULL) {
  401             strncpy(buf, buffer, len);
  402             buf[len - 1] = '\0';
  403 
  404             free(buffer);
  405         }
  406         free(wbuffer);
  407     }
  408 #else
  409     unsigned char *c;
  410 
  411     for (c = (unsigned char *) buf; *c; c++) {
  412         if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
  413             *c = '?';
  414     }
  415 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
  416     return buf;
  417 }
  418 
  419 
  420 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  421 /*
  422  * NOTES: don't make wc a wint_t as libutf8 (at least version 0.8)
  423  *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
  424  *        and thus loop termination fails.
  425  */
  426 wchar_t *
  427 wconvert_to_printable(
  428     wchar_t *wbuf,
  429     t_bool keep_tab)
  430 {
  431     wchar_t *wc;
  432 
  433     for (wc = wbuf; *wc; wc++) {
  434         if (!iswprint((wint_t) *wc) && !(keep_tab && *wc == (wchar_t) '\t'))
  435             *wc = (wchar_t) '?';
  436     }
  437 
  438     return wbuf;
  439 }
  440 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */