"Fossies" - the Fresh Open Source Software Archive

Member "sitecopy-0.16.6/intl/localcharset.c" (18 Oct 2006, 12492 Bytes) of archive /linux/www/sitecopy-0.16.6.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "localcharset.c", see the Fossies "Dox" file reference documentation.

    1 /* Determine a canonical name for the current locale's character encoding.
    2 
    3    Copyright (C) 2000-2006 Free Software Foundation, Inc.
    4 
    5    This program is free software; you can redistribute it and/or modify it
    6    under the terms of the GNU Library General Public License as published
    7    by the Free Software Foundation; either version 2, or (at your option)
    8    any later version.
    9 
   10    This program is distributed in the hope that it will be useful,
   11    but WITHOUT ANY WARRANTY; without even the implied warranty of
   12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13    Library General Public License for more details.
   14 
   15    You should have received a copy of the GNU Library General Public
   16    License along with this program; if not, write to the Free Software
   17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
   18    USA.  */
   19 
   20 /* Written by Bruno Haible <bruno@clisp.org>.  */
   21 
   22 #include <config.h>
   23 
   24 /* Specification.  */
   25 #include "localcharset.h"
   26 
   27 #include <stddef.h>
   28 #include <stdio.h>
   29 #include <string.h>
   30 #include <stdlib.h>
   31 
   32 #if defined _WIN32 || defined __WIN32__
   33 # define WIN32_NATIVE
   34 #endif
   35 
   36 #if defined __EMX__
   37 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
   38 # define OS2
   39 #endif
   40 
   41 #if !defined WIN32_NATIVE
   42 # if HAVE_LANGINFO_CODESET
   43 #  include <langinfo.h>
   44 # else
   45 #  if 0 /* see comment below */
   46 #   include <locale.h>
   47 #  endif
   48 # endif
   49 # ifdef __CYGWIN__
   50 #  define WIN32_LEAN_AND_MEAN
   51 #  include <windows.h>
   52 # endif
   53 #elif defined WIN32_NATIVE
   54 # define WIN32_LEAN_AND_MEAN
   55 # include <windows.h>
   56 #endif
   57 #if defined OS2
   58 # define INCL_DOS
   59 # include <os2.h>
   60 #endif
   61 
   62 #if ENABLE_RELOCATABLE
   63 # include "relocatable.h"
   64 #else
   65 # define relocate(pathname) (pathname)
   66 #endif
   67 
   68 /* Get LIBDIR.  */
   69 #ifndef LIBDIR
   70 # include "configmake.h"
   71 #endif
   72 
   /* Character placed between a directory name and a file name when the
      two are joined.  May be predefined by the build environment.  */
   #ifndef DIRECTORY_SEPARATOR
   # define DIRECTORY_SEPARATOR '/'
   #endif

   /* ISSLASH (C) is true if the character C separates file name components.
      On Win32, Cygwin, OS/2 and DOS both kinds of slash are accepted;
      elsewhere only DIRECTORY_SEPARATOR is, unless ISSLASH was predefined.  */
   #if (defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ \
        || defined __EMX__ || defined __DJGPP__)
   # define ISSLASH(C) ((C) == '/' || (C) == '\\')
   #elif !defined ISSLASH
   # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
   #endif
   85 
   86 #if HAVE_DECL_GETC_UNLOCKED
   87 # undef getc
   88 # define getc getc_unlocked
   89 #endif
   90 
   91 /* The following static variable is declared 'volatile' to avoid a
   92    possible multithread problem in the function get_charset_aliases. If we
   93    are running in a threaded environment, and if two threads initialize
   94    'charset_aliases' simultaneously, both will produce the same value,
   95    and everything will be ok if the two assignments to 'charset_aliases'
   96    are atomic. But I don't know what will happen if the two assignments mix.  */
   97 #if __STDC__ != 1
   98 # define volatile /* empty */
   99 #endif
  /* Cache for the charset alias table.  It is NULL until
     get_charset_aliases has stored a table in it.  A table is a sequence
     of NUL-terminated string pairs, closed by an empty string:
       ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
  static const char * volatile charset_aliases = NULL;
  104 
  105 /* Return a pointer to the contents of the charset.alias file.  */
  106 static const char *
  107 get_charset_aliases (void)
  108 {
  109   const char *cp;
  110 
  111   cp = charset_aliases;
  112   if (cp == NULL)
  113     {
  114 #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
  115       FILE *fp;
  116       const char *dir;
  117       const char *base = "charset.alias";
  118       char *file_name;
  119 
  120       /* Make it possible to override the charset.alias location.  This is
  121      necessary for running the testsuite before "make install".  */
  122       dir = getenv ("CHARSETALIASDIR");
  123       if (dir == NULL || dir[0] == '\0')
  124     dir = relocate (LIBDIR);
  125 
  126       /* Concatenate dir and base into freshly allocated file_name.  */
  127       {
  128     size_t dir_len = strlen (dir);
  129     size_t base_len = strlen (base);
  130     int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
  131     file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
  132     if (file_name != NULL)
  133       {
  134         memcpy (file_name, dir, dir_len);
  135         if (add_slash)
  136           file_name[dir_len] = DIRECTORY_SEPARATOR;
  137         memcpy (file_name + dir_len + add_slash, base, base_len + 1);
  138       }
  139       }
  140 
  141       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
  142     /* Out of memory or file not found, treat it as empty.  */
  143     cp = "";
  144       else
  145     {
  146       /* Parse the file's contents.  */
  147       char *res_ptr = NULL;
  148       size_t res_size = 0;
  149 
  150       for (;;)
  151         {
  152           int c;
  153           char buf1[50+1];
  154           char buf2[50+1];
  155           size_t l1, l2;
  156           char *old_res_ptr;
  157 
  158           c = getc (fp);
  159           if (c == EOF)
  160         break;
  161           if (c == '\n' || c == ' ' || c == '\t')
  162         continue;
  163           if (c == '#')
  164         {
  165           /* Skip comment, to end of line.  */
  166           do
  167             c = getc (fp);
  168           while (!(c == EOF || c == '\n'));
  169           if (c == EOF)
  170             break;
  171           continue;
  172         }
  173           ungetc (c, fp);
  174           if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
  175         break;
  176           l1 = strlen (buf1);
  177           l2 = strlen (buf2);
  178           old_res_ptr = res_ptr;
  179           if (res_size == 0)
  180         {
  181           res_size = l1 + 1 + l2 + 1;
  182           res_ptr = (char *) malloc (res_size + 1);
  183         }
  184           else
  185         {
  186           res_size += l1 + 1 + l2 + 1;
  187           res_ptr = (char *) realloc (res_ptr, res_size + 1);
  188         }
  189           if (res_ptr == NULL)
  190         {
  191           /* Out of memory. */
  192           res_size = 0;
  193           if (old_res_ptr != NULL)
  194             free (old_res_ptr);
  195           break;
  196         }
  197           strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
  198           strcpy (res_ptr + res_size - (l2 + 1), buf2);
  199         }
  200       fclose (fp);
  201       if (res_size == 0)
  202         cp = "";
  203       else
  204         {
  205           *(res_ptr + res_size) = '\0';
  206           cp = res_ptr;
  207         }
  208     }
  209 
  210       if (file_name != NULL)
  211     free (file_name);
  212 
  213 #else
  214 
  215 # if defined VMS
  216       /* To avoid the troubles of an extra file charset.alias_vms in the
  217      sources of many GNU packages, simply inline the aliases here.  */
  218       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
  219      "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
  220      section 10.7 "Handling Different Character Sets".  */
  221       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
  222        "ISO8859-2" "\0" "ISO-8859-2" "\0"
  223        "ISO8859-5" "\0" "ISO-8859-5" "\0"
  224        "ISO8859-7" "\0" "ISO-8859-7" "\0"
  225        "ISO8859-8" "\0" "ISO-8859-8" "\0"
  226        "ISO8859-9" "\0" "ISO-8859-9" "\0"
  227        /* Japanese */
  228        "eucJP" "\0" "EUC-JP" "\0"
  229        "SJIS" "\0" "SHIFT_JIS" "\0"
  230        "DECKANJI" "\0" "DEC-KANJI" "\0"
  231        "SDECKANJI" "\0" "EUC-JP" "\0"
  232        /* Chinese */
  233        "eucTW" "\0" "EUC-TW" "\0"
  234        "DECHANYU" "\0" "DEC-HANYU" "\0"
  235        "DECHANZI" "\0" "GB2312" "\0"
  236        /* Korean */
  237        "DECKOREAN" "\0" "EUC-KR" "\0";
  238 # endif
  239 
  240 # if defined WIN32_NATIVE || defined __CYGWIN__
  241       /* To avoid the troubles of installing a separate file in the same
  242      directory as the DLL and of retrieving the DLL's directory at
  243      runtime, simply inline the aliases here.  */
  244 
  245       cp = "CP936" "\0" "GBK" "\0"
  246        "CP1361" "\0" "JOHAB" "\0"
  247        "CP20127" "\0" "ASCII" "\0"
  248        "CP20866" "\0" "KOI8-R" "\0"
  249        "CP20936" "\0" "GB2312" "\0"
  250        "CP21866" "\0" "KOI8-RU" "\0"
  251        "CP28591" "\0" "ISO-8859-1" "\0"
  252        "CP28592" "\0" "ISO-8859-2" "\0"
  253        "CP28593" "\0" "ISO-8859-3" "\0"
  254        "CP28594" "\0" "ISO-8859-4" "\0"
  255        "CP28595" "\0" "ISO-8859-5" "\0"
  256        "CP28596" "\0" "ISO-8859-6" "\0"
  257        "CP28597" "\0" "ISO-8859-7" "\0"
  258        "CP28598" "\0" "ISO-8859-8" "\0"
  259        "CP28599" "\0" "ISO-8859-9" "\0"
  260        "CP28605" "\0" "ISO-8859-15" "\0"
  261        "CP38598" "\0" "ISO-8859-8" "\0"
  262        "CP51932" "\0" "EUC-JP" "\0"
  263        "CP51936" "\0" "GB2312" "\0"
  264        "CP51949" "\0" "EUC-KR" "\0"
  265        "CP51950" "\0" "EUC-TW" "\0"
  266        "CP54936" "\0" "GB18030" "\0"
  267        "CP65001" "\0" "UTF-8" "\0";
  268 # endif
  269 #endif
  270 
  271       charset_aliases = cp;
  272     }
  273 
  274   return cp;
  275 }
  276 
  /* Determine the current locale's character encoding, and canonicalize it
     into one of the canonical names listed in config.charset.

     The raw name comes from a platform-dependent source:
       - nl_langinfo (CODESET) where HAVE_LANGINFO_CODESET is set.  On
         Cygwin a result of "US-ASCII" is replaced by the encoding suffix
         of the locale name in the environment, or else by "CP" followed
         by the number returned by GetACP ();
       - the first non-empty one of the environment variables LC_ALL,
         LC_CTYPE, LANG on other systems without nl_langinfo (CODESET);
       - "CP" followed by the number returned by GetACP () on native Win32;
       - on OS/2, the encoding suffix of the locale name in the
         environment, else the whole locale name, else "CP" followed by
         the codepage number reported by DosQueryCp.
     The raw name is then mapped through the table returned by
     get_charset_aliases; an alias entry "*" matches every name.  An
     encoding suffix taken from the environment on Cygwin or OS/2 is
     returned as is, without going through the table.

     The result must not be freed.  It may point to a static buffer that
     is overwritten by the next call.
     If the canonical name cannot be determined, the result is a
     non-canonical name.  The result is never NULL and never empty.  */

  #ifdef STATIC
  STATIC
  #endif
  const char *
  locale_charset (void)
  {
    const char *codeset;
    const char *aliases;

  #if !(defined WIN32_NATIVE || defined OS2)

  # if HAVE_LANGINFO_CODESET

    /* Most systems support nl_langinfo (CODESET) nowadays.  */
    codeset = nl_langinfo (CODESET);

  #  ifdef __CYGWIN__
    /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
       returns "US-ASCII".  As long as this is not fixed, return the suffix
       of the locale name from the environment variables (if present) or
       the codepage as a number.  */
    if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
      {
        const char *locale;
        static char buf[2 + 10 + 1];

        locale = getenv ("LC_ALL");
        if (locale == NULL || locale[0] == '\0')
          {
            locale = getenv ("LC_CTYPE");
            if (locale == NULL || locale[0] == '\0')
              locale = getenv ("LANG");
          }
        if (locale != NULL && locale[0] != '\0')
          {
            /* If the locale name contains an encoding after the dot, return
               it.  */
            const char *dot = strchr (locale, '.');

            if (dot != NULL)
              {
                const char *modifier;

                dot++;
                /* Look for the possible @... trailer and remove it, if any.
                   An empty encoding ("xx_YY." or "xx_YY.@mod") is treated
                   like a missing one, so that this function never returns
                   an empty string.  */
                modifier = strchr (dot, '@');
                if (modifier == NULL)
                  {
                    if (dot[0] != '\0')
                      return dot;
                  }
                else if (modifier != dot
                         && (size_t) (modifier - dot) < sizeof (buf))
                  {
                    memcpy (buf, dot, modifier - dot);
                    buf [modifier - dot] = '\0';
                    return buf;
                  }
              }
          }

        /* Woe32 has a function returning the locale's codepage as a number.  */
        sprintf (buf, "CP%u", GetACP ());
        codeset = buf;
      }
  #  endif

  # else

    /* On old systems which lack it, use setlocale or getenv.  */
    const char *locale = NULL;

    /* But most old systems don't have a complete set of locales.  Some
       (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
       use setlocale here; it would return "C" when it doesn't support the
       locale name the user has set.  */
  #  if 0
    locale = setlocale (LC_CTYPE, NULL);
  #  endif
    if (locale == NULL || locale[0] == '\0')
      {
        locale = getenv ("LC_ALL");
        if (locale == NULL || locale[0] == '\0')
          {
            locale = getenv ("LC_CTYPE");
            if (locale == NULL || locale[0] == '\0')
              locale = getenv ("LANG");
          }
      }

    /* On some old systems, one used to set locale = "iso8859_1". On others,
       you set it to "language_COUNTRY.charset". In any case, we resolve it
       through the charset.alias file.  */
    codeset = locale;

  # endif

  #elif defined WIN32_NATIVE

    static char buf[2 + 10 + 1];

    /* Woe32 has a function returning the locale's codepage as a number.  */
    sprintf (buf, "CP%u", GetACP ());
    codeset = buf;

  #elif defined OS2

    const char *locale;
    static char buf[2 + 10 + 1];
    ULONG cp[3];
    ULONG cplen;

    /* Allow user to override the codeset, as set in the operating system,
       with standard language environment variables.  */
    locale = getenv ("LC_ALL");
    if (locale == NULL || locale[0] == '\0')
      {
        locale = getenv ("LC_CTYPE");
        if (locale == NULL || locale[0] == '\0')
          locale = getenv ("LANG");
      }
    if (locale != NULL && locale[0] != '\0')
      {
        /* If the locale name contains an encoding after the dot, return it.  */
        const char *dot = strchr (locale, '.');

        if (dot != NULL)
          {
            const char *modifier;

            dot++;
            /* Look for the possible @... trailer and remove it, if any.
               An empty encoding ("xx_YY." or "xx_YY.@mod") is treated like
               a missing one, so that this function never returns an empty
               string.  */
            modifier = strchr (dot, '@');
            if (modifier == NULL)
              {
                if (dot[0] != '\0')
                  return dot;
              }
            else if (modifier != dot
                     && (size_t) (modifier - dot) < sizeof (buf))
              {
                memcpy (buf, dot, modifier - dot);
                buf [modifier - dot] = '\0';
                return buf;
              }
          }

        /* Resolve through the charset.alias file.  */
        codeset = locale;
      }
    else
      {
        /* OS/2 has a function returning the locale's codepage as a number.  */
        if (DosQueryCp (sizeof (cp), cp, &cplen))
          codeset = "";
        else
          {
            /* cp[0] is a ULONG; convert it so that the argument matches the
               %u conversion and the output fits the 10 digits reserved in
               buf.  */
            sprintf (buf, "CP%u", (unsigned int) cp[0]);
            codeset = buf;
          }
      }

  #endif

    if (codeset == NULL)
      /* The canonical name cannot be determined.  */
      codeset = "";

    /* Resolve alias.  The table is a list of (alias, canonical) string
       pairs, so each step skips two strings; the list ends at an empty
       alias.  */
    for (aliases = get_charset_aliases ();
         *aliases != '\0';
         aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
      if (strcmp (codeset, aliases) == 0
          || (aliases[0] == '*' && aliases[1] == '\0'))
        {
          codeset = aliases + strlen (aliases) + 1;
          break;
        }

    /* Don't return an empty string.  GNU libc and GNU libiconv interpret
       the empty string as denoting "the locale's character encoding",
       thus GNU libiconv would call this function a second time.  */
    if (codeset[0] == '\0')
      codeset = "ASCII";

    return codeset;
  }