"Fossies" - the Fresh Open Source Software Archive 
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 #include <stdio.h>
16 #include <stdlib.h>
17
18 #ifdef FOR_LT
19
20 #include "lt-memory.h"
21
22 #define Malloc salloc
23
24 #else
25
26 #include "system.h"
27
28 #endif
29
30 #include "charset.h"
31 #include "string16.h"
32
/*
 * Conversion tables for the eight table-driven 8-bit encodings, all filled
 * in by init_charset().  The first index (0..7) follows the row order of
 * latin_table, i.e. the encodings named ISO-8859-2 ... ISO-8859-9 in
 * CharacterEncodingName.
 */
33 int iso_to_unicode[8][256]; /* byte value -> Unicode code point; -1 where latin_table has no mapping */
34 int iso_max_val[8]; /* largest Unicode code point that appears in each table */
35 char8 *unicode_to_iso[8]; /* code point -> byte, '?' if unmapped; each holds iso_max_val[i]+1 entries */
36
37 /* This table is used to initialise iso_to_unicode, iso_max_val and
 * unicode_to_iso (see init_charset).
 *
 * Each row holds the Unicode code points for byte values 0xA0..0xFF of one
 * encoding: entry k describes byte 0xA0 + k, giving 96 entries per row.
 * Bytes below 0xA0 are not listed because init_charset maps them to
 * themselves.  An entry written -00001 (i.e. -1) marks a byte that has no
 * Unicode mapping.
 *
 * The "latinN" row labels are historical; the rows are in the order
 * ISO-8859-2 ... ISO-8859-9, so several of them are not Latin alphabets.
 */
38
39 static int latin_table[8][96] = {
40
41 /* latin2: row 0, ISO-8859-2 */
42 {
43 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
44 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
45 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
46 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
47 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
48 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
49 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
50 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
51 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
52 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
53 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
54 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
55 },
56
57 /* latin3: row 1, ISO-8859-3 (contains unmapped bytes) */
58 {
59 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -00001, 0x0124, 0x00a7,
60 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -00001, 0x017b,
61 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,
62 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -00001, 0x017c,
63 0x00c0, 0x00c1, 0x00c2, -00001, 0x00c4, 0x010a, 0x0108, 0x00c7,
64 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
65 -00001, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,
66 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
67 0x00e0, 0x00e1, 0x00e2, -00001, 0x00e4, 0x010b, 0x0109, 0x00e7,
68 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
69 -00001, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,
70 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,
71 },
72
73 /* latin4: row 2, ISO-8859-4 */
74 {
75 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,
76 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
77 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,
78 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
79 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,
80 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
81 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
82 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
83 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,
84 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
85 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
86 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,
87 },
88
89 /* latin5: row 3, ISO-8859-5 (the code points are Cyrillic, U+04xx) */
90 {
91 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
92 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,
93 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
94 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
95 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
96 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
97 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
98 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
99 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
100 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
101 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
102 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f,
103 },
104
105 /* latin6: row 4, ISO-8859-6 (the code points are Arabic, U+06xx; many bytes unmapped) */
106 {
107 0x00a0, -00001, -00001, -00001, 0x00a4, -00001, -00001, -00001,
108 -00001, -00001, -00001, -00001, 0x060c, 0x00ad, -00001, -00001,
109 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
110 -00001, -00001, -00001, 0x061b, -00001, -00001, -00001, 0x061f,
111 -00001, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
112 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,
113 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
114 0x0638, 0x0639, 0x063a, -00001, -00001, -00001, -00001, -00001,
115 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
116 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,
117 0x0650, 0x0651, 0x0652, -00001, -00001, -00001, -00001, -00001,
118 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
119 },
120
121 /* latin7: row 5, ISO-8859-7 (the code points are Greek, U+03xx) */
122 {
123 0x00a0, 0x02bd, 0x02bc, 0x00a3, -00001, -00001, 0x00a6, 0x00a7,
124 0x00a8, 0x00a9, -00001, 0x00ab, 0x00ac, 0x00ad, -00001, 0x2015,
125 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,
126 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
127 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
128 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
129 0x03a0, 0x03a1, -00001, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
130 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
131 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
132 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
133 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
134 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, -00001,
135 },
136
137 /* latin8: row 6, ISO-8859-8 (the code points are Hebrew, U+05xx; many bytes unmapped) */
138 {
139 0x00a0, -00001, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
140 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e,
141 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
142 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, -00001,
143 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
144 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
145 -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
146 -00001, -00001, -00001, -00001, -00001, -00001, -00001, 0x2017,
147 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,
148 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,
149 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,
150 0x05e8, 0x05e9, 0x05ea, -00001, -00001, -00001, -00001, -00001,
151 },
152
153 /* latin9: row 7, ISO-8859-9 (identical to U+00A0..U+00FF except for six entries) */
154 {
155 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
156 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
157 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
158 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
159 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
160 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
161 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
162 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,
163 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
164 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
165 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
166 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
167 }
168 };
169
/*
 * Name of each encoding without any byte-order suffix.  The array is
 * indexed by CharacterEncoding value: FindEncoding() casts a matching index
 * straight back to CharacterEncoding, so the order here must match the
 * enum's order (declared elsewhere; presumably charset.h).  The two byte
 * orders of UTF-16 and of UCS-2 deliberately share one name each.
 */
170 const char8 *CharacterEncodingName[CE_enum_count] = {
171 "unknown",
172 "unspecified-ascii-superset",
173
174 "UTF-8",
175 "ISO-646",
176
177 "ISO-8859-1",
178 "ISO-8859-2",
179 "ISO-8859-3",
180 "ISO-8859-4",
181 "ISO-8859-5",
182 "ISO-8859-6",
183 "ISO-8859-7",
184 "ISO-8859-8",
185 "ISO-8859-9",
186
187 "UTF-16",
188 "UTF-16",
189 "ISO-10646-UCS-2",
190 "ISO-10646-UCS-2",
191 };
192
/*
 * Name of each encoding including a byte-order suffix ("-B" / "-L") for the
 * 16-bit encodings, so that every entry is distinct.  Indexed by
 * CharacterEncoding value in the same order as CharacterEncodingName;
 * FindEncoding() consults this table first.
 * NOTE(review): entry 1 is spelled with underscores here but with hyphens
 * in CharacterEncodingName; FindEncoding() therefore accepts both spellings.
 */
193 const char8 *CharacterEncodingNameAndByteOrder[CE_enum_count] = {
194 "unknown",
195 "unspecified_ascii_superset",
196
197 "UTF-8",
198 "ISO-646",
199
200 "ISO-8859-1",
201 "ISO-8859-2",
202 "ISO-8859-3",
203 "ISO-8859-4",
204 "ISO-8859-5",
205 "ISO-8859-6",
206 "ISO-8859-7",
207 "ISO-8859-8",
208 "ISO-8859-9",
209
210 "UTF-16-B",
211 "UTF-16-L",
212 "ISO-10646-UCS-2-B",
213 "ISO-10646-UCS-2-L",
214 };
215
/*
 * Additional names accepted by FindEncoding() after both name tables have
 * been searched.  "UCS-2" without a byte order resolves to the B variant.
 * NOTE(review): "ISO-Latin-5" .. "ISO-Latin-8" are mapped to ISO-8859-5 ..
 * ISO-8859-8 here, whereas the IANA registry uses "latin5" for ISO-8859-9;
 * confirm this is intended before relying on these aliases.
 */
216 struct character_encoding_alias CharacterEncodingAlias[] = {
217 {"ASCII", CE_ISO_646},
218 {"ISO-Latin-1", CE_ISO_8859_1},
219 {"ISO-Latin-2", CE_ISO_8859_2},
220 {"ISO-Latin-3", CE_ISO_8859_3},
221 {"ISO-Latin-4", CE_ISO_8859_4},
222 {"ISO-Latin-5", CE_ISO_8859_5},
223 {"ISO-Latin-6", CE_ISO_8859_6},
224 {"ISO-Latin-7", CE_ISO_8859_7},
225 {"ISO-Latin-8", CE_ISO_8859_8},
226 {"UCS-2", CE_ISO_10646_UCS_2B},
227 };
/* Number of entries in CharacterEncodingAlias, computed from the array itself. */
228 const int CE_alias_count =
229 sizeof(CharacterEncodingAlias)/sizeof(CharacterEncodingAlias[0]);
230
/*
 * The internal encoding, set by init_charset(): CE_unspecified_ascii_superset
 * when CHAR_SIZE is 8, otherwise CE_UTF_16B or CE_UTF_16L according to the
 * byte order detected at run time.
 */
231 CharacterEncoding InternalCharacterEncoding;
232
233 void init_charset(void)
234 {
235 int i, j;
236
237 /* Determine internal encoding */
238
239 #if CHAR_SIZE == 8
240 InternalCharacterEncoding = CE_unspecified_ascii_superset;
241 #else
242 union {char b[2]; short s;} bytes;
243 bytes.s = 1;
244
245 InternalCharacterEncoding = (bytes.b[0] == 0) ? CE_UTF_16B : CE_UTF_16L;
246 #endif
247
248 /* Make ISO-Latin-N tables */
249
250 for(i=0; i<8; i++)
251 {
252 int max = 0x9f;
253
254 for(j=0; j<0xa0; j++)
255 iso_to_unicode[i][j] = j;
256 for(j=0xa0; j<0x100; j++)
257 {
258 int code = latin_table[i][j-0xa0];
259 iso_to_unicode[i][j] = code;
260 if(code > max) max = code;
261 }
262
263 iso_max_val[i] = max;
264
265 if(!(unicode_to_iso[i] = Malloc(max+1)))
266 {
267 fprintf(stderr, "Malloc failed in charset initialisation\n");
268 exit(1);
269 }
270
271 for(j=0; j<0xa0; j++)
272 unicode_to_iso[i][j] = j;
273 for(j=0xa0; j<=max; j++)
274 unicode_to_iso[i][j] = '?';
275 for(j=0xa0; j<0x100; j++)
276 {
277 int code = latin_table[i][j-0xa0];
278 if(code != -1)
279 unicode_to_iso[i][code] = j;
280 }
281 }
282 }
283
284 /* Return true if the encoding has 8-bit input units and is the same
285 as ascii for characters <= 127 */
286
287 int EncodingIsAsciiSuperset(CharacterEncoding enc)
288 {
289 return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
290 }
291
292 /*
293 * Return true if enc1 and enc2 have the same size input units, and are
294 * the same for Unicode <= 127.
295 * If so, *enc3 is set to enc2 modified to have the same byte order as enc1.
296 */
297
298 int EncodingsCompatible(CharacterEncoding enc1, CharacterEncoding enc2,
299 CharacterEncoding *enc3)
300 {
301 if(EncodingIsAsciiSuperset(enc1))
302 {
303 if(EncodingIsAsciiSuperset(enc2))
304 {
305 *enc3 = enc2;
306 return 1;
307 }
308 return 0;
309 }
310
311 if(enc1 == CE_UTF_16B || enc1 == CE_ISO_10646_UCS_2B)
312 {
313 if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
314 *enc3 = CE_UTF_16B;
315 else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
316 *enc3 = CE_ISO_10646_UCS_2B;
317 else
318 return 0;
319 return 1;
320 }
321
322 if(enc1 == CE_UTF_16L || enc1 == CE_ISO_10646_UCS_2L)
323 {
324 if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
325 *enc3 = CE_UTF_16L;
326 else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
327 *enc3 = CE_ISO_10646_UCS_2L;
328 else
329 return 0;
330 return 1;
331 }
332
333 return 0;
334 }
335
336 CharacterEncoding FindEncoding(char8 *name)
337 {
338 int i;
339
340 for(i=0; i<CE_enum_count; i++)
341 if(strcasecmp8(name, CharacterEncodingNameAndByteOrder[i]) == 0)
342 return (CharacterEncoding)i;
343
344 for(i=0; i<CE_enum_count; i++)
345 if(strcasecmp8(name, CharacterEncodingName[i]) == 0)
346 return (CharacterEncoding)i;
347
348 for(i=0; i<CE_alias_count; i++)
349 if(strcasecmp8(name, CharacterEncodingAlias[i].name) == 0)
350 return CharacterEncodingAlias[i].enc;
351
352 return CE_unknown;
353 }
354