"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.2/src/charset.c" (8 Dec 2017, 12717 Bytes) of package /linux/misc/tin-2.4.2.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.1_vs_2.4.2.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : charset.c
    4  *  Author    : M. Kuhn, T. Burmester
    5  *  Created   : 1993-12-10
    6  *  Updated   : 2017-10-18
    7  *  Notes     : ISO to ascii charset conversion routines
    8  *
    9  * Copyright (c) 1993-2018 Markus Kuhn <mgk25@cl.cam.ac.uk>
   10  * All rights reserved.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. The name of the author may not be used to endorse or promote
   21  *    products derived from this software without specific prior written
   22  *    permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
   25  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   26  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
   28  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   30  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   31  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   32  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   33  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   34  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   35  */
   36 
   37 
   38 #ifndef TIN_H
   39 #   include "tin.h"
   40 #endif /* !TIN_H */
   41 
   42 /*
   43  *  Table for the iso2asc conversion
   44  *  iso2asc  by  (unrza3@cd4680fs.rrze.uni-erlangen.de)
   45  *  included by  (root@aspic.han.de)
   46  */
   47 
   48 #define SUB "?"
   49 #define ISO_EXTRA   0xa0 /* beginning of second range of printable chars */
   50 
   51 /*
   52  * TABSTOP(x) is the column of the character after the TAB
   53  * at column x. First column is 0, of course.
   54  */
   55 
   56 #define TABSTOP(x)  (((x) - ((x)&7)) + 8)
   57 
   58 static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] =
   59 {
   60     /* universal table for many languages */
   61     {
   62     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   63     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   64     "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
   65     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
   66     "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
   67     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
   68     },
   69     /* single-spacing universal table */
   70     {
   71     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
   72     " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
   73     "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
   74     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
   75     "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
   76     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
   77     },
   78     /* table for Danish, Dutch, German, Norwegian and Swedish */
   79     {
   80     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   81     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   82     "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
   83     "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
   84     "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
   85     "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
   86     },
   87     /* table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant */
   88     {
   89     " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   90     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   91     "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
   92     "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
   93     "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
   94     "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
   95     },
   96     /* table with RFC1345 codes in brackets */
   97     {
   98     "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
   99     "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
  100     "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
  101     "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
  102     "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
  103     "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
  104     "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
  105     "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
  106     "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
  107     "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
  108     "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
  109     "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
  110     },
  111     /* table for printers that allow overstriking with backspace */
  112     {
  113     " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
  114     "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
  115     " ","+\b_","2","3","'","u","P",".",
  116     ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
  117     "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
  118     "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
  119     "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
  120     "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
  121     "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
  122     "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
  123     "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
  124     "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
  125     },
  126     /* table for IBM PC character set (code page 437) */
  127     {
  128     "\377","\255","\233","\234",SUB,"\235","|","\25",
  129     "\"","(c)","\246","\256","\252","-","(R)","-",
  130     "\370","\361","\375","3","'","\346","\24","\371",
  131     ",","1","\247","\257","\254","\253"," 3/4","\250",
  132     "A","A","A","A","\216","\217","\222","\200",
  133     "E","\220","E","E","I","I","I","I",
  134     "D","\245","O","O","O","O","\231","x",
  135     "\355","U","U","U","\232","Y","T","\341",
  136     "\205","\240","\203","a","\204","\206","\221","\207",
  137     "\212","\202","\210","\211","\215","\241","\214","\213",
  138     "d","\244","\225","\242","\223","o","\224","\366",
  139     "\355","\227","\243","\226","\201","y","t","\230"
  140     }
  141 };
  142 
  143 /*
  144  * German TeX style to Latin-1 conversion (by root@aspic, 12/04/93)
  145  */
  146 
  147 #define TEX_SUBST   15
  148 #define SPACES      "                                                                                                         "
  149 
  150 static const char *const tex_from[TEX_SUBST] =
  151 {
  152     "\"a","\\\"a","\"o","\\\"o","\"u","\\\"u","\"A","\\\"A","\"O","\\\"O","\"U","\\\"U","\"s","\\\"s","\\3"
  153 };
  154 
  155 /*
  156  *  Now the conversion function...
  157  */
  158 
  159 void
  160 convert_iso2asc(
  161     char *iso,
  162     char **asc_buffer,
  163     size_t *max_line_len,
  164     int t)
  165 {
  166     constext *p;
  167     constext *const *tab;
  168     char *asc;
  169     t_bool first;   /* flag for first SPACE/TAB after other characters */
  170     int i, a;   /* column counters in iso and asc */
  171 
  172     asc = *asc_buffer;
  173 
  174     if (iso == NULL || asc == NULL)
  175         return;
  176 
  177     tab = iso2asc[t];
  178     first = TRUE;
  179     i = a = 0;
  180     while (*iso != '\0') {
  181         if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
  182             p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
  183             iso++;
  184             i++;
  185             first = TRUE;
  186             while (*p) {
  187                 *(asc++) = *(p++);
  188                 if ((asc - *asc_buffer) >= (int) *max_line_len) {
  189                     int offset = asc - *asc_buffer;
  190                     *max_line_len += 64;
  191                     *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  192                     asc = *asc_buffer + offset;
  193                 }
  194                 a++;
  195             }
  196         } else {
  197             if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
  198                 /*
  199                  * spaces or TABS should be removed
  200                  */
  201                 if (*iso == ' ') {
  202                     /*
  203                      * only the first space after a letter must not be removed
  204                      */
  205                     if (first) {
  206                         *(asc++) = ' ';
  207                         a++;
  208                         first = FALSE;
  209                     }
  210                     i++;
  211                 } else {    /* here: *iso == '\t' */
  212                     if (a >= TABSTOP(i)) {
  213                         /*
  214                          * remove TAB or replace it with SPACE if necessary
  215                          */
  216                         if (first) {
  217                             *(asc++) = ' ';
  218                             a++;
  219                             first = FALSE;
  220                         }
  221                     } else {
  222                         /*
  223                          * TAB will correct the column difference
  224                          */
  225                         *(asc++) = '\t';    /* = *iso */
  226                         a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
  227                     }
  228                     i = TABSTOP(i);
  229                 }
  230                 iso++;
  231             } else {
  232                 /*
  233                  * just copy the characters and advance the column counters
  234                  */
  235                 if (*iso == '\t') {
  236                     a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
  237                 } else if (*iso == '\b') {
  238                     a--;
  239                     i--;
  240                 } else {
  241                     a++;
  242                     i++;
  243                 }
  244                 *(asc++) = *(iso++);
  245                 first = TRUE;
  246             }
  247         }
  248         if ((asc - *asc_buffer) >= (int) *max_line_len) {
  249             int offset = asc - *asc_buffer;
  250             *max_line_len += 64;
  251             *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  252             asc = *asc_buffer + offset;
  253         }
  254     }
  255     *asc = '\0';
  256 }
  257 
  258 
  259 void
  260 convert_tex2iso(
  261     char *from,
  262     char *to)
  263 {
  264     const char *tex_to[TEX_SUBST];
  265     int i;
  266     size_t spaces = 0; /* spaces to add */
  267     size_t len, col = 0;    /* length of from, col counter */
  268     size_t subst_len;
  269     t_bool ex;
  270 
  271     /* initialize tex_to */
  272     /*
  273      * Charsets which have german umlauts incl. sharp s at the same
  274      * code position as ISO-8859-1
  275      * DEC-MCS, Windows-1252
  276      */
  277     if (IS_LOCAL_CHARSET("ISO-8859-1") ||
  278         IS_LOCAL_CHARSET("ISO-8859-2") ||
  279         IS_LOCAL_CHARSET("ISO-8859-3") ||
  280         IS_LOCAL_CHARSET("ISO-8859-4") ||
  281         IS_LOCAL_CHARSET("ISO-8859-9") ||
  282         IS_LOCAL_CHARSET("ISO-8859-10") ||
  283         IS_LOCAL_CHARSET("ISO-8859-13") ||
  284         IS_LOCAL_CHARSET("ISO-8859-14") ||
  285         IS_LOCAL_CHARSET("ISO-8859-15") ||
  286         IS_LOCAL_CHARSET("ISO-8859-16") ||
  287         iso2asc_supported >= 0) {
  288         tex_to[1] = tex_to[0] = "\344"; /* auml */
  289         tex_to[3] = tex_to[2] = "\366"; /* ouml */
  290         tex_to[5] = tex_to[4] = "\374"; /* uuml */
  291         tex_to[7] = tex_to[6] = "\304"; /* Auml */
  292         tex_to[9] = tex_to[8] = "\326"; /* Ouml */
  293         tex_to[11] = tex_to[10] = "\334";   /* Uuml */
  294         tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
  295     } else if (IS_LOCAL_CHARSET("UTF-8")) { /* locale charset is UTF-8 */
  296         tex_to[1] = tex_to[0] = "\303\244"; /* auml */
  297         tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
  298         tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
  299         tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
  300         tex_to[9] = tex_to[8] = "\303\266"; /* Ouml */
  301         tex_to[11] = tex_to[10] = "\303\234";   /* Uuml */
  302         tex_to[14] = tex_to[13] = tex_to[12] = "\303\237";  /* szlig */
  303     } else {
  304         strcpy(to, from);
  305         return;
  306     }
  307 
  308     *to = '\0';
  309     len = strlen(from);
  310 
  311     while (col < len) {
  312         i = 0;
  313         ex = FALSE;
  314         while ((i < TEX_SUBST) && !ex) {
  315             subst_len = strlen(tex_from[i]);
  316             if (!strncmp(from + col, tex_from[i], subst_len)) {
  317                 strcat(to, tex_to[i]);
  318                 spaces += subst_len - strlen(tex_to[i]);
  319                 col += subst_len - 1;
  320                 ex = TRUE;
  321             }
  322             i++;
  323         }
  324         if (!ex)
  325             strncat(to, from + col, 1);
  326         if (from[col] == ' ') {
  327             strncat(to, SPACES, spaces);
  328             spaces = 0;
  329         }
  330 
  331         col++;
  332     }
  333 }
  334 
  335 
  336 /*
  337  * Check for german TeX encoding in file open on fp
  338  */
  339 t_bool
  340 is_art_tex_encoded(
  341     FILE *fp)
  342 {
  343     char line[LEN];
  344     int i, len;
  345     t_bool body = FALSE;
  346 
  347     rewind(fp);
  348 
  349     while (fgets(line, (int) sizeof(line), fp) != NULL) {
  350         if (line[0] == '\n' && !body)
  351             body = TRUE;
  352         else if (!body)
  353             continue;
  354 
  355         i = 0;
  356 
  357         while (line[i++] == ' ')
  358             ;   /* search for first non blank */
  359 
  360         i--;
  361 
  362         if (!isalnum((unsigned char) line[i]))
  363             continue;   /* quoting char */
  364 
  365         len = strlen(line) - 1;
  366         for (i = 1; i < len; i++) {
  367             if (((line[i] == '\\') || (line[i] == '\"')) &&
  368                             (isalnum((unsigned char) line[i - 1])) &&
  369                             (isalnum((unsigned char) line[i + 1])))
  370                 return TRUE;
  371         }
  372     }
  373 
  374     return FALSE;
  375 }
  376 
  377 
  378 /*
  379  * Replace all non printable characters by '?'
  380  */
  381 char *
  382 convert_to_printable(
  383     char *buf,
  384     t_bool keep_tab)
  385 {
  386 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  387     char *buffer;
  388     wchar_t *wbuffer;
  389     size_t len = strlen(buf) + 1;
  390 
  391     if (IS_LOCAL_CHARSET("UTF-8"))
  392         utf8_valid(buf);
  393 
  394     if ((wbuffer = char2wchar_t(buf)) != NULL) {
  395         wconvert_to_printable(wbuffer, keep_tab);
  396 
  397         if ((buffer = wchar_t2char(wbuffer)) != NULL) {
  398             strncpy(buf, buffer, len);
  399             buf[len - 1] = '\0';
  400 
  401             free(buffer);
  402         }
  403         free(wbuffer);
  404     }
  405 #else
  406     unsigned char *c;
  407 
  408     for (c = (unsigned char *) buf; *c; c++) {
  409         if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
  410             *c = '?';
  411     }
  412 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
  413     return buf;
  414 }
  415 
  416 
  417 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  418 /*
  419  * NOTES: don't make wc a wint_t as libutf8 (at least version 0.8)
  420  *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
  421  *        and thus loop termination fails.
  422  */
  423 wchar_t *
  424 wconvert_to_printable(
  425     wchar_t *wbuf,
  426     t_bool keep_tab)
  427 {
  428     wchar_t *wc;
  429 
  430     for (wc = wbuf; *wc; wc++) {
  431         if (!iswprint((wint_t) *wc) && !(keep_tab && *wc == (wchar_t) '\t'))
  432             *wc = (wchar_t) '?';
  433     }
  434 
  435     return wbuf;
  436 }
  437 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */