"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/rxp/charset.c" (4 Sep 2017, 11771 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "charset.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

    1 /*************************************************************************/
    2 /*                                                                       */
    3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
    4 /* University of Edinburgh.                                              */
    5 /*                                                                       */
    6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,     */
    7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
    8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
    9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
   10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF    */
   11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION    */
   12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.       */
   13 /*                                                                       */
   14 /*************************************************************************/
   15 #include <stdio.h>
   16 #include <stdlib.h>
   17 
   18 #ifdef FOR_LT
   19 
   20 #include "lt-memory.h"
   21 
   22 #define Malloc salloc
   23 
   24 #else
   25 
   26 #include "system.h"
   27 
   28 #endif
   29 
   30 #include "charset.h"
   31 #include "string16.h"
   32 
      /*
       * Conversion tables for the eight character sets described by
       * latin_table; all three are filled in by init_charset().
       *
       *   iso_to_unicode[i][b]  code for 8-bit value b in set i: b itself
       *                         below 0xa0, otherwise the latin_table entry
       *                         (which is -1 when b has no mapping).
       *   iso_max_val[i]        largest code occurring in set i (never less
       *                         than 0x9f).
       *   unicode_to_iso[i][u]  8-bit value for code u in set i, or '?' when
       *                         set i has no such character.  Allocated with
       *                         iso_max_val[i]+1 entries, so u must not
       *                         exceed iso_max_val[i].
       */
   33 int iso_to_unicode[8][256];     /* latin-2 ... latin-9 */
   34 int iso_max_val[8];
   35 char8 *unicode_to_iso[8];
   36 
   37 /*
       * This table is used to initialise the above arrays (init_charset reads
       * it).  Row i, column k is the code stored for 8-bit value 0xa0+k in
       * set i; values below 0xa0 are not listed because init_charset maps
       * them to themselves.  An entry of -00001 (i.e. -1) marks an 8-bit
       * value that has no code assigned.
       *
       * NOTE(review): the row labels below appear to follow the ISO-8859 part
       * number (2 ... 9) rather than the official "Latin alphabet No. N"
       * names -- e.g. the "latin5" row holds Cyrillic code points -- confirm.
       */
   38 
   39 static int latin_table[8][96] = {
   40 
   41 /* latin2 */
   42 {
   43 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
   44 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
   45 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
   46 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
   47 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
   48 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
   49 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
   50 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
   51 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
   52 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
   53 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
   54 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
   55 },
   56 
   57 /* latin3 */
   58 {
   59 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -00001, 0x0124, 0x00a7,
   60 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -00001, 0x017b,
   61 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,
   62 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -00001, 0x017c,
   63 0x00c0, 0x00c1, 0x00c2, -00001, 0x00c4, 0x010a, 0x0108, 0x00c7,
   64 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
   65 -00001, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,
   66 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
   67 0x00e0, 0x00e1, 0x00e2, -00001, 0x00e4, 0x010b, 0x0109, 0x00e7,
   68 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
   69 -00001, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,
   70 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,
   71 },
   72 
   73 /* latin4 */
   74 {
   75 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,
   76 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
   77 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,
   78 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
   79 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,
   80 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
   81 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
   82 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
   83 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,
   84 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
   85 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
   86 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,
   87 },
   88 
   89 /* latin5 */
   90 {
   91 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
   92 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,
   93 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
   94 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
   95 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
   96 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
   97 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
   98 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
   99 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
  100 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
  101 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
  102 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f,
  103 },
  104 
  105 /* latin6 */
  106 {
  107 0x00a0, -00001, -00001, -00001, 0x00a4, -00001, -00001, -00001,
  108 -00001, -00001, -00001, -00001, 0x060c, 0x00ad, -00001, -00001,
  109 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
  110 -00001, -00001, -00001, 0x061b, -00001, -00001, -00001, 0x061f,
  111 -00001, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
  112 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,
  113 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
  114 0x0638, 0x0639, 0x063a, -00001, -00001, -00001, -00001, -00001,
  115 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
  116 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,
  117 0x0650, 0x0651, 0x0652, -00001, -00001, -00001, -00001, -00001,
  118 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
  119 },
  120 
  121 /* latin7 */
  122 {
  123 0x00a0, 0x02bd, 0x02bc, 0x00a3, -00001, -00001, 0x00a6, 0x00a7,
  124 0x00a8, 0x00a9, -00001, 0x00ab, 0x00ac, 0x00ad, -00001, 0x2015,
  125 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,
  126 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
  127 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
  128 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
  129 0x03a0, 0x03a1, -00001, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
  130 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
  131 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
  132 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
  133 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
  134 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, -00001,
  135 },
  136 
  137 /* latin8 */
  138 {
  139 0x00a0, -00001, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  140 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e,
  141 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  142 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, -00001,
  143 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
  144 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
  145 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
  146 -00001, -00001, -00001, -00001, -00001, -00001, -00001, 0x2017,
  147 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,
  148 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,
  149 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,
  150 0x05e8, 0x05e9, 0x05ea, -00001, -00001, -00001, -00001, -00001,
  151 },
  152 
  153 /* latin9 */
  154 {
  155 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  156 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
  157 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  158 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
  159 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
  160 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
  161 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
  162 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,
  163 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
  164 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
  165 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
  166 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
  167 }
  168 };
  169 
      /*
       * Name of each encoding, without any byte-order suffix.  The array is
       * indexed by CharacterEncoding value: FindEncoding() returns the index
       * of a matching entry cast to CharacterEncoding.  The two "UTF-16"
       * entries and the two "ISO-10646-UCS-2" entries are deliberately
       * identical here; CharacterEncodingNameAndByteOrder tells them apart.
       */
  170 const char8 *CharacterEncodingName[CE_enum_count] = {
  171     "unknown",
  172     "unspecified-ascii-superset",
  173 
  174     "UTF-8",
  175     "ISO-646",
  176 
  177     "ISO-8859-1",
  178     "ISO-8859-2",
  179     "ISO-8859-3",
  180     "ISO-8859-4",
  181     "ISO-8859-5",
  182     "ISO-8859-6",
  183     "ISO-8859-7",
  184     "ISO-8859-8",
  185     "ISO-8859-9",
  186 
  187     "UTF-16",
  188     "UTF-16",
  189     "ISO-10646-UCS-2",
  190     "ISO-10646-UCS-2",
  191 };
  192 
      /*
       * Name of each encoding with a "-B" or "-L" suffix on the 16-bit ones
       * (presumably big- and little-endian, matching the CE_..B / CE_..L
       * constants -- confirm against charset.h).  Indexed by
       * CharacterEncoding value; FindEncoding() searches this table before
       * CharacterEncodingName.
       *
       * Note that the second entry is spelled with underscores here but with
       * hyphens in CharacterEncodingName.
       */
  193 const char8 *CharacterEncodingNameAndByteOrder[CE_enum_count] = {
  194     "unknown",
  195     "unspecified_ascii_superset",
  196 
  197     "UTF-8",
  198     "ISO-646",
  199 
  200     "ISO-8859-1",
  201     "ISO-8859-2",
  202     "ISO-8859-3",
  203     "ISO-8859-4",
  204     "ISO-8859-5",
  205     "ISO-8859-6",
  206     "ISO-8859-7",
  207     "ISO-8859-8",
  208     "ISO-8859-9",
  209 
  210     "UTF-16-B",
  211     "UTF-16-L",
  212     "ISO-10646-UCS-2-B",
  213     "ISO-10646-UCS-2-L",
  214 };
  215 
      /*
       * Alternative names accepted by FindEncoding(), each paired with the
       * encoding it stands for.  This table is consulted last, after both
       * name tables have failed to match.  "UCS-2" without a byte order
       * resolves to the ..2B variant.
       *
       * NOTE(review): "ISO-Latin-N" is mapped to ISO-8859-N for N = 5..8.
       * In the ISO 8859 standards "Latin-5" names part 9, and
       * "Latin-6/7/8" name parts 10/13/14, so these four aliases may not
       * mean what a document author expects -- confirm before relying on
       * them.  There is no alias for CE_ISO_8859_9.
       */
  216 struct character_encoding_alias CharacterEncodingAlias[] = {
  217     {"ASCII", CE_ISO_646},
  218     {"ISO-Latin-1", CE_ISO_8859_1},
  219     {"ISO-Latin-2", CE_ISO_8859_2},
  220     {"ISO-Latin-3", CE_ISO_8859_3},
  221     {"ISO-Latin-4", CE_ISO_8859_4},
  222     {"ISO-Latin-5", CE_ISO_8859_5},
  223     {"ISO-Latin-6", CE_ISO_8859_6},
  224     {"ISO-Latin-7", CE_ISO_8859_7},
  225     {"ISO-Latin-8", CE_ISO_8859_8},
  226     {"UCS-2", CE_ISO_10646_UCS_2B},
  227 };
      /* Number of entries in CharacterEncodingAlias, computed from its size. */
  228 const int CE_alias_count =
  229     sizeof(CharacterEncodingAlias)/sizeof(CharacterEncodingAlias[0]);
  230 
      /* Encoding of the program's own character type.  Set by init_charset():
         CE_unspecified_ascii_superset when CHAR_SIZE is 8, otherwise
         CE_UTF_16B or CE_UTF_16L according to the host byte order. */
  231 CharacterEncoding InternalCharacterEncoding;
  232 
  233 void init_charset(void)
  234 {
  235     int i, j;
  236 
  237     /* Determine internal encoding */
  238 
  239 #if CHAR_SIZE == 8
  240     InternalCharacterEncoding = CE_unspecified_ascii_superset;
  241 #else
  242     union {char b[2]; short s;} bytes;
  243     bytes.s = 1;
  244 
  245     InternalCharacterEncoding = (bytes.b[0] == 0) ? CE_UTF_16B : CE_UTF_16L;
  246 #endif
  247 
  248     /* Make ISO-Latin-N tables */
  249 
  250     for(i=0; i<8; i++)
  251     {
  252     int max = 0x9f;
  253 
  254     for(j=0; j<0xa0; j++)
  255         iso_to_unicode[i][j] = j;
  256     for(j=0xa0; j<0x100; j++)
  257     {
  258         int code = latin_table[i][j-0xa0];
  259         iso_to_unicode[i][j] = code;
  260         if(code > max) max = code;
  261     }
  262 
  263     iso_max_val[i] = max;
  264 
  265     if(!(unicode_to_iso[i] = Malloc(max+1)))
  266     {
  267         fprintf(stderr, "Malloc failed in charset initialisation\n");
  268         exit(1);
  269     }
  270 
  271     for(j=0; j<0xa0; j++)
  272         unicode_to_iso[i][j] = j;
  273     for(j=0xa0; j<=max; j++)
  274         unicode_to_iso[i][j] = '?';
  275     for(j=0xa0; j<0x100; j++)
  276     {
  277         int code = latin_table[i][j-0xa0];
  278         if(code != -1)
  279         unicode_to_iso[i][code] = j;
  280     }
  281     }
  282 }
  283 
  284 /* Return true if the encoding has 8-bit input units and is the same
  285    as ascii for characters <= 127 */
  286 
  287 int EncodingIsAsciiSuperset(CharacterEncoding enc)
  288 {
  289     return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
  290 }
  291 
  292 /* 
  293  * Return true if enc1 and enc2 have the same size input units, and are
  294  * the same for Unicode <= 127.
  295  * If so, *enc3 is set to enc2 modified to have the same byte order as enc1.
  296  */
  297 
  298 int EncodingsCompatible(CharacterEncoding enc1, CharacterEncoding enc2,
  299             CharacterEncoding *enc3)
  300 {
  301     if(EncodingIsAsciiSuperset(enc1))
  302     {
  303     if(EncodingIsAsciiSuperset(enc2))
  304     {
  305         *enc3 = enc2;
  306         return 1;
  307     }
  308     return 0;
  309     }
  310 
  311     if(enc1 == CE_UTF_16B || enc1 == CE_ISO_10646_UCS_2B)
  312     {
  313     if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
  314         *enc3 = CE_UTF_16B;
  315     else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
  316         *enc3 = CE_ISO_10646_UCS_2B;
  317     else
  318         return 0;
  319     return 1;
  320     }
  321 
  322     if(enc1 == CE_UTF_16L || enc1 == CE_ISO_10646_UCS_2L)
  323     {
  324     if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
  325         *enc3 = CE_UTF_16L;
  326     else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
  327         *enc3 = CE_ISO_10646_UCS_2L;
  328     else
  329         return 0;
  330     return 1;
  331     }
  332 
  333     return 0;
  334 }
  335 
  336 CharacterEncoding FindEncoding(char8 *name)
  337 {
  338     int i;
  339 
  340     for(i=0; i<CE_enum_count; i++)
  341     if(strcasecmp8(name, CharacterEncodingNameAndByteOrder[i]) == 0)
  342         return (CharacterEncoding)i;
  343 
  344     for(i=0; i<CE_enum_count; i++)
  345     if(strcasecmp8(name, CharacterEncodingName[i]) == 0)
  346         return (CharacterEncoding)i;
  347 
  348     for(i=0; i<CE_alias_count; i++)
  349     if(strcasecmp8(name, CharacterEncodingAlias[i].name) == 0)
  350         return CharacterEncodingAlias[i].enc;
  351 
  352     return CE_unknown;
  353 }
  354