"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.1/src/charset.c" (12 Oct 2016, 12936 Bytes) of package /linux/misc/tin-2.4.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.0_vs_2.4.1.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : charset.c
    4  *  Author    : M. Kuhn, T. Burmester
    5  *  Created   : 1993-12-10
    6  *  Updated   : 2016-03-10
    7  *  Notes     : ISO to ascii charset conversion routines
    8  *
    9  * Copyright (c) 1993-2017 Markus Kuhn <mgk25@cl.cam.ac.uk>
   10  * All rights reserved.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. The name of the author may not be used to endorse or promote
   21  *    products derived from this software without specific prior written
   22  *    permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
   25  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   26  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
   28  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   30  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   31  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   32  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   33  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   34  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   35  */
   36 
   37 
   38 #ifndef TIN_H
   39 #   include "tin.h"
   40 #endif /* !TIN_H */
   41 
   42 /*
   43  *  Table for the iso2asc conversion
   44  *  iso2asc  by  (unrza3@cd4680fs.rrze.uni-erlangen.de)
   45  *  included by  (root@aspic.han.de)
   46  */
   47 
   48 #define SUB "?"  /* placeholder entry used in the tables below */
   49 #define ISO_EXTRA   0xa0 /* beginning of second range of printable chars; table index 0 corresponds to this code */
   50
   51 /*
   52  * TABSTOP(x) is the column of the character after a TAB placed
   53  * at column x. Columns count from 0; tab stops are 8 columns apart.
   54  */
   55
   56 #define TABSTOP(x)  (((x) - ((x)&7)) + 8)  /* TABSTOP(0) == 8, TABSTOP(7) == 8, TABSTOP(8) == 16 */
   57 
   58 static constext *const iso2asc[NUM_ISO_TABLES][256-ISO_EXTRA] =  /* [table][code - ISO_EXTRA] -> replacement string */
   59 {
   60     /* table 0: universal table for many languages (replacements may be several characters long) */
   61     {
   62     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   63     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   64     "A","A","A","A","A","A","AE","C","E","E","E","E","I","I","I","I",
   65     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","Th","ss",
   66     "a","a","a","a","a","a","ae","c","e","e","e","e","i","i","i","i",
   67     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","th","y"
   68     },
   69     /* table 1: single-spacing universal table (every replacement is exactly one character) */
   70     {
   71     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","c","a","<","-","-","R","-",
   72     " ",SUB,"2","3","'","u","P",".",",","1","o",">",SUB,SUB,SUB,"?",
   73     "A","A","A","A","A","A","A","C","E","E","E","E","I","I","I","I",
   74     "D","N","O","O","O","O","O","x","O","U","U","U","U","Y","T","s",
   75     "a","a","a","a","a","a","a","c","e","e","e","e","i","i","i","i",
   76     "d","n","o","o","o","o","o",":","o","u","u","u","u","y","t","y"
   77     },
   78     /* table 2: table for Danish, Dutch, German, Norwegian and Swedish (umlauts written as Ae, Oe, Ue, ...) */
   79     {
   80     " ","!","c",SUB,SUB,"Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   81     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   82     "A","A","A","A","Ae","Aa","AE","C","E","E","E","E","I","I","I","I",
   83     "D","N","O","O","O","O","Oe","x","Oe","U","U","U","Ue","Y","Th","ss",
   84     "a","a","a","a","ae","aa","ae","c","e","e","e","e","i","i","i","i",
   85     "d","n","o","o","o","o","oe",":","oe","u","u","u","ue","y","th","ij"
   86     },
   87     /* table 3: table for Danish, Finnish, Norwegian and Swedish, ISO 646 variant (national letters map to [ ] \ { } | etc.) */
   88     {
   89     " ","!","c",SUB,"$","Y","|",SUB,"\"","(c)","a","<<","-","-","(R)","-",
   90     " ","+/-","2","3","'","u","P",".",",","1","o",">>"," 1/4"," 1/2"," 3/4","?",
   91     "A","A","A","A","[","]","[","C","E","@","E","E","I","I","I","I",
   92     "D","N","O","O","O","O","\\","x","\\","U","U","U","^","Y","Th","ss",
   93     "a","a","a","a","{","}","{","c","e","`","e","e","i","i","i","i",
   94     "d","n","o","o","o","o","|",":","|","u","u","u","~","y","th","y"
   95     },
   96     /* table 4: table with RFC1345 codes in brackets */
   97     {
   98     "[NS]","[!I]","[Ct]","[Pd]","[Cu]","[Ye]","[BB]","[SE]",
   99     "[':]","[Co]","[-a]","[<<]","[NO]","[--]","[Rg]","['-]",
  100     "[DG]","[+-]","[2S]","[3S]","['']","[My]","[PI]","[.M]",
  101     "[',]","[1S]","[-o]","[>>]","[14]","[12]","[34]","[?I]",
  102     "[A!]","[A']","[A>]","[A?]","[A:]","[AA]","[AE]","[C,]",
  103     "[E!]","[E']","[E>]","[E:]","[I!]","[I']","[I>]","[I:]",
  104     "[D-]","[N?]","[O!]","[O']","[O>]","[O?]","[O:]","[*X]",
  105     "[O/]","[U!]","[U']","[U>]","[U:]","[Y']","[TH]","[ss]",
  106     "[a!]","[a']","[a>]","[a?]","[a:]","[aa]","[ae]","[c,]",
  107     "[e!]","[e']","[e>]","[e:]","[i!]","[i']","[i>]","[i:]",
  108     "[d-]","[n?]","[o!]","[o']","[o>]","[o?]","[o:]","[-:]",
  109     "[o/]","[u!]","[u']","[u>]","[u:]","[y']","[th]","[y:]"
  110     },
  111     /* table 5: table for printers that allow overstriking with backspace (letter, \b, accent) */
  112     {
  113     " ","!","c\b|","L\b-","o\bX","Y\b=","|",SUB,
  114     "\"","(c)","a\b_","<<","-\b,","-","(R)","-",
  115     " ","+\b_","2","3","'","u","P",".",
  116     ",","1","o\b_",">>"," 1/4"," 1/2"," 3/4","?",
  117     "A\b`","A\b'","A\b^","A\b~","A\b\"","Aa","AE","C\b,",
  118     "E\b`","E\b'","E\b^","E\b\"","I\b`","I\b'","I\b^","I\b\"",
  119     "D\b-","N\b~","O\b`","O\b'","O\b^","O\b~","O\b\"","x",
  120     "O\b/","U\b`","U\b'","U\b^","U\b\"","Y\b'","Th","ss",
  121     "a\b`","a\b'","a\b^","a\b~","a\b\"","aa","ae","c\b,",
  122     "e\b`","e\b'","e\b^","e\b\"","i\b`","i\b'","i\b^","i\b\"",
  123     "d\b-","n\b~","o\b`","o\b'","o\b^","o\b~","o\b\"","-\b:",
  124     "o\b/","u\b`","u\b'","u\b^","u\b\"","y\b'","th","y\b\""
  125     },
  126     /* table 6: table for IBM PC character set (code page 437); octal escapes are code page 437 byte values */
  127     {
  128     "\377","\255","\233","\234",SUB,"\235","|","\25",
  129     "\"","(c)","\246","\256","\252","-","(R)","-",
  130     "\370","\361","\375","3","'","\346","\24","\371",
  131     ",","1","\247","\257","\254","\253"," 3/4","\250",
  132     "A","A","A","A","\216","\217","\222","\200",
  133     "E","\220","E","E","I","I","I","I",
  134     "D","\245","O","O","O","O","\231","x",
  135     "\355","U","U","U","\232","Y","T","\341",
  136     "\205","\240","\203","a","\204","\206","\221","\207",
  137     "\212","\202","\210","\211","\215","\241","\214","\213",
  138     "d","\244","\225","\242","\223","o","\224","\366",
  139     "\355","\227","\243","\226","\201","y","t","\230"
  140     }
  141 };
  142 
  143 /*
  144  * German TeX-style umlaut notation and its replacement, used by convert_tex2iso() (by root@aspic, 12/04/93)
  145  */
  146
  147 #define TEX_SUBST   15  /* number of entries in tex_from[] and in the tex_to[] array of convert_tex2iso() */
  148 #define SPACES      "                                                                                                         "  /* supply of blanks convert_tex2iso() pads from */
  149
  150 static const char *const tex_from[TEX_SUBST] =  /* TeX sequences; entry i is replaced by tex_to[i] in convert_tex2iso() */
  151 {
  152     "\"a","\\\"a","\"o","\\\"o","\"u","\\\"u","\"A","\\\"A","\"O","\\\"O","\"U","\\\"U","\"s","\\\"s","\\3"
  153 };
  154 
  155 /*
  156  *  Now the conversion function...
  157  */
  158 
  159 void
  160 convert_iso2asc(
  161     char *iso,
  162     char **asc_buffer,
  163     size_t *max_line_len,
  164     int t)
  165 {
  166     constext *p;
  167     constext *const *tab;
  168     char *asc;
  169     t_bool first;   /* flag for first SPACE/TAB after other characters */
  170     int i, a;   /* column counters in iso and asc */
  171 
  172     asc = *asc_buffer;
  173 
  174     if (iso == NULL || asc == NULL)
  175         return;
  176 
  177     tab = iso2asc[t];
  178     first = TRUE;
  179     i = a = 0;
  180     while (*iso != '\0') {
  181         if (*EIGHT_BIT(iso) >= ISO_EXTRA) {
  182             p = tab[*EIGHT_BIT(iso) - ISO_EXTRA];
  183             iso++;
  184             i++;
  185             first = TRUE;
  186             while (*p) {
  187                 *(asc++) = *(p++);
  188                 if ((asc - *asc_buffer) >= (int) *max_line_len) {
  189                     int offset = asc - *asc_buffer;
  190                     *max_line_len += 64;
  191                     *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  192                     asc = *asc_buffer + offset;
  193                 }
  194                 a++;
  195             }
  196         } else {
  197             if (a > i && ((*iso == ' ') || (*iso == '\t'))) {
  198                 /*
  199                  * spaces or TABS should be removed
  200                  */
  201                 if (*iso == ' ') {
  202                     /*
  203                      * only the first space after a letter must not be removed
  204                      */
  205                     if (first) {
  206                         *(asc++) = ' ';
  207                         a++;
  208                         first = FALSE;
  209                     }
  210                     i++;
  211                 } else {    /* here: *iso == '\t' */
  212                     if (a >= TABSTOP(i)) {
  213                         /*
  214                          * remove TAB or replace it with SPACE if necessary
  215                          */
  216                         if (first) {
  217                             *(asc++) = ' ';
  218                             a++;
  219                             first = FALSE;
  220                         }
  221                     } else {
  222                         /*
  223                          * TAB will correct the column difference
  224                          */
  225                         *(asc++) = '\t';    /* = *iso */
  226                         a = TABSTOP(a); /* = TABSTOP(i), because i < a < TABSTOP(i) */
  227                     }
  228                     i = TABSTOP(i);
  229                 }
  230                 iso++;
  231             } else {
  232                 /*
  233                  * just copy the characters and advance the column counters
  234                  */
  235                 if (*iso == '\t') {
  236                     a = i = TABSTOP(i); /* = TABSTOP(a), because here a = i */
  237                 } else if (*iso == '\b') {
  238                     a--;
  239                     i--;
  240                 } else {
  241                     a++;
  242                     i++;
  243                 }
  244                 *(asc++) = *(iso++);
  245                 first = TRUE;
  246             }
  247         }
  248         if ((asc - *asc_buffer) >= (int) *max_line_len) {
  249             int offset = asc - *asc_buffer;
  250             *max_line_len += 64;
  251             *asc_buffer = my_realloc(*asc_buffer, *max_line_len);
  252             asc = *asc_buffer + offset;
  253         }
  254     }
  255     *asc = '\0';
  256 
  257     return;
  258 }
  259 
  260 
  261 void
  262 convert_tex2iso(
  263     char *from,
  264     char *to)
  265 {
  266     const char *tex_to[TEX_SUBST];
  267     int i;
  268     size_t spaces = 0; /* spaces to add */
  269     size_t len, col = 0;    /* length of from, col counter */
  270     size_t subst_len;
  271     t_bool ex;
  272 
  273     /* initialize tex_to */
  274     /*
  275      * Charsets which have german umlauts incl. sharp s at the same
  276      * code position as ISO-8859-1
  277      * DEC-MCS, Windows-1252
  278      */
  279     if (!strcasecmp(tinrc.mm_local_charset, "ISO-8859-1") ||
  280         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-2") ||
  281         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-3") ||
  282         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-4") ||
  283         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-9") ||
  284         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-10") ||
  285         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-13") ||
  286         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-14") ||
  287         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-15") ||
  288         !strcasecmp(tinrc.mm_local_charset, "ISO-8859-16") ||
  289         iso2asc_supported >= 0) {
  290         tex_to[1] = tex_to[0] = "\344"; /* auml */
  291         tex_to[3] = tex_to[2] = "\366"; /* ouml */
  292         tex_to[5] = tex_to[4] = "\374"; /* uuml */
  293         tex_to[7] = tex_to[6] = "\304"; /* Auml */
  294         tex_to[9] = tex_to[8] = "\326"; /* Ouml */
  295         tex_to[11] = tex_to[10] = "\334";   /* Uuml */
  296         tex_to[14] = tex_to[13] = tex_to[12] = "\337"; /* szlig */
  297     } else if (!strcasecmp(tinrc.mm_local_charset, "UTF-8")) { /* locale charset is UTF-8 */
  298         tex_to[1] = tex_to[0] = "\303\244"; /* auml */
  299         tex_to[3] = tex_to[2] = "\303\266"; /* ouml */
  300         tex_to[5] = tex_to[4] = "\303\274"; /* uuml */
  301         tex_to[7] = tex_to[6] = "\303\204"; /* Auml */
  302         tex_to[9] = tex_to[8] = "\303\266"; /* Ouml */
  303         tex_to[11] = tex_to[10] = "\303\234";   /* Uuml */
  304         tex_to[14] = tex_to[13] = tex_to[12] = "\303\237";  /* szlig */
  305     } else {
  306         strcpy(to, from);
  307         return;
  308     }
  309 
  310     *to = '\0';
  311     len = strlen(from);
  312 
  313     while (col < len) {
  314         i = 0;
  315         ex = FALSE;
  316         while ((i < TEX_SUBST) && !ex) {
  317             subst_len = strlen(tex_from[i]);
  318             if (!strncmp(from + col, tex_from[i], subst_len)) {
  319                 strcat(to, tex_to[i]);
  320                 spaces += subst_len - strlen(tex_to[i]);
  321                 col += subst_len - 1;
  322                 ex = TRUE;
  323             }
  324             i++;
  325         }
  326         if (!ex)
  327             strncat(to, from + col, 1);
  328         if (from[col] == ' ') {
  329             strncat(to, SPACES, spaces);
  330             spaces = 0;
  331         }
  332 
  333         col++;
  334     }
  335 }
  336 
  337 
  338 /*
  339  * Check for german TeX encoding in file open on fp
  340  */
  341 t_bool
  342 is_art_tex_encoded(
  343     FILE *fp)
  344 {
  345     char line[LEN];
  346     int i, len;
  347     t_bool body = FALSE;
  348 
  349     rewind(fp);
  350 
  351     while (fgets(line, (int) sizeof(line), fp) != NULL) {
  352         if (line[0] == '\n' && !body)
  353             body = TRUE;
  354         else if (!body)
  355             continue;
  356 
  357         i = 0;
  358 
  359         while (line[i++] == ' ')
  360             ;   /* search for first non blank */
  361 
  362         i--;
  363 
  364         if (!isalnum((unsigned char) line[i]))
  365             continue;   /* quoting char */
  366 
  367         len = strlen(line) - 1;
  368         for (i = 1; i < len; i++) {
  369             if (((line[i] == '\\') || (line[i] == '\"')) &&
  370                             (isalnum((unsigned char) line[i - 1])) &&
  371                             (isalnum((unsigned char) line[i + 1])))
  372                 return TRUE;
  373         }
  374     }
  375 
  376     return FALSE;
  377 }
  378 
  379 
  380 /*
  381  * Replace all non printable characters by '?'
  382  */
  383 char *
  384 convert_to_printable(
  385     char *buf,
  386     t_bool keep_tab)
  387 {
  388 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  389     char *buffer;
  390     wchar_t *wbuffer;
  391     size_t len = strlen(buf) + 1;
  392 
  393     if (IS_LOCAL_CHARSET("UTF-8"))
  394         utf8_valid(buf);
  395 
  396     if ((wbuffer = char2wchar_t(buf)) != NULL) {
  397         wconvert_to_printable(wbuffer, keep_tab);
  398 
  399         if ((buffer = wchar_t2char(wbuffer)) != NULL) {
  400             strncpy(buf, buffer, len);
  401             buf[len - 1] = '\0';
  402 
  403             free(buffer);
  404         }
  405         free(wbuffer);
  406     }
  407 #else
  408     unsigned char *c;
  409 
  410     for (c = (unsigned char *) buf; *c; c++) {
  411         if (!my_isprint(*c) && !(keep_tab && *c == '\t'))
  412             *c = '?';
  413     }
  414 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
  415     return buf;
  416 }
  417 
  418 
  419 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
  420 /*
  421  * NOTES: don't make wc a wint_t as libutf8 (at least version 0.8)
  422  *        sometimes fails to proper convert (wchar_t) 0 to (wint_t) 0
  423  *        and thus loop termination fails.
  424  */
  425 wchar_t *
  426 wconvert_to_printable(
  427     wchar_t *wbuf,
  428     t_bool keep_tab)
  429 {
  430     wchar_t *wc;
  431 
  432     for (wc = wbuf; *wc; wc++) {
  433         if (!iswprint((wint_t) *wc) && !(keep_tab && *wc == (wchar_t) '\t'))
  434             *wc = (wchar_t) '?';
  435     }
  436 
  437     return wbuf;
  438 }
  439 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */