w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

ucnvmbcs.cpp
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2000-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: ucnvmbcs.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2000jul03
16 * created by: Markus W. Scherer
17 *
18 * The current code in this file replaces the previous implementation
19 * of conversion code from multi-byte codepages to Unicode and back.
20 * This implementation supports the following:
21 * - legacy variable-length codepages with up to 4 bytes per character
22 * - all Unicode code points (up to 0x10ffff)
23 * - efficient distinction of unassigned vs. illegal byte sequences
24 * - it is possible in fromUnicode() to directly deal with simple
25 * stateful encodings (used for EBCDIC_STATEFUL)
26 * - it is possible to convert Unicode code points
27 * to a single zero byte (but not as a fallback except for SBCS)
28 *
29 * Remaining limitations in fromUnicode:
30 * - byte sequences must not have leading zero bytes
31 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
32 * - limitation to up to 4 bytes per character
33 *
34 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
35 * limitations and adds m:n character mappings and other features.
36 * See ucnv_ext.h for details.
37 *
38 * Change history:
39 *
40 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
41 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
42 * macros to ucnvmbcs.h file
43 */
44 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
48 
49 #include "unicode/ucnv.h"
50 #include "unicode/ucnv_cb.h"
51 #include "unicode/udata.h"
52 #include "unicode/uset.h"
53 #include "unicode/utf8.h"
54 #include "unicode/utf16.h"
55 #include "ucnv_bld.h"
56 #include "ucnvmbcs.h"
57 #include "ucnv_ext.h"
58 #include "ucnv_cnv.h"
59 #include "cmemory.h"
60 #include "cstring.h"
61 #include "umutex.h"
62 #include "ustr_imp.h"
63 
64 /* control optimizations according to the platform */
65 #define MBCS_UNROLL_SINGLE_TO_BMP 1
66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
67 
68 /*
69  * _MBCSHeader versions 5.3 & 4.3
70  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
71  *
72  * This version is optional. Version 5 is used for incompatible data format changes.
73  * makeconv will continue to generate version 4 files if possible.
74  *
75  * Changes from version 4:
76  *
77  * The main difference is an additional _MBCSHeader field with
78  * - the length (number of uint32_t) of the _MBCSHeader
79  * - flags for further incompatible data format changes
80  * - flags for further, backward compatible data format changes
81  *
82  * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
83  * the file and needs to be reconstituted at load time.
84  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
85  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
86  * (For details about these structures see below, and see ucnvmbcs.h.)
87  *
88  * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
89  * of the Unicode code points. (This requires that the .ucm file has the |0 etc.
90  * precision markers for all mappings.)
91  *
92  * All fallbacks have been moved to the extension table, leaving only roundtrips in the
93  * omitted data that can be reconstituted from the toUnicode data.
94  *
95  * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
96  * With only roundtrip mappings in the base fromUnicode data, this part is fully
97  * redundant with the mbcsIndex and will be reconstituted from that (also using the
98  * stage 1 table which contains the information about how stage 2 was compacted).
99  *
100  * The rest of the stage 2 table, the part for code points above maxFastUChar,
101  * is stored in the file and will be appended to the reconstituted part.
102  *
103  * The entire fromUBytes array is omitted from the file and will be reconstituted.
104  * This is done by enumerating all toUnicode roundtrip mappings, performing
105  * each mapping (using the stage 1 and reconstituted stage 2 tables) and
106  * writing instead of reading the byte values.
107  *
108  * _MBCSHeader version 4.3
109  *
110  * Change from version 4.2:
111  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
112  * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
113  * files which can be used instead of stages 1 & 2.
114  * Faster lookups for roundtrips from most commonly used characters,
115  * and lookups from UTF-8 byte sequences with a natural bit distribution.
116  * See ucnvmbcs.h for more details.
117  *
118  * Change from version 4.1:
119  * - Added an optional extension table structure at the end of the .cnv file.
120  * It is present if the upper bits of the header flags field contains a non-zero
121  * byte offset to it.
122  * Files that contain only an extension table and no base table
123  * use the special outputType MBCS_OUTPUT_EXT_ONLY.
124  * These contain the base table name between the MBCS header and the extension
125  * data.
126  *
127  * Change from version 4.0:
128  * - Replace header.reserved with header.fromUBytesLength so that all
129  * fields in the data have length.
130  *
131  * Changes from version 3 (for performance improvements):
132  * - new bit distribution for state table entries
133  * - reordered action codes
134  * - new data structure for single-byte fromUnicode
135  * + stage 2 only contains indexes
136  * + stage 3 stores 16 bits per character with classification bits 15..8
137  * - no multiplier for stage 1 entries
138  * - stage 2 for non-single-byte codepages contains the index and the flags in
139  * one 32-bit value
140  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
141  *
142  * For more details about old versions of the MBCS data structure, see
143  * the corresponding versions of this file.
144  *
145  * Converting stateless codepage data ---------------------------------------***
146  * (or codepage data with simple states) to Unicode.
147  *
148  * Data structure and algorithm for converting from complex legacy codepages
149  * to Unicode. (Designed before 2000-may-22.)
150  *
151  * The basic idea is that the structure of legacy codepages can be described
152  * with state tables.
153  * When reading a byte stream, each input byte causes a state transition.
154  * Some transitions result in the output of a code point, some result in
155  * "unassigned" or "illegal" output.
156  * This is used here for character conversion.
157  *
158  * The data structure begins with a state table consisting of a row
159  * per state, with 256 entries (columns) per row for each possible input
160  * byte value.
161  * Each entry is 32 bits wide, with two formats distinguished by
162  * the sign bit (bit 31):
163  *
164  * One format for transitional entries (bit 31 not set) for non-final bytes, and
165  * one format for final entries (bit 31 set).
166  * Both formats contain the number of the next state in the same bit
167  * positions.
168  * State 0 is the initial state.
169  *
170  * Most of the time, the offset values of subsequent states are added
171  * up to a scalar value. This value will eventually be the index of
172  * the Unicode code point in a table that follows the state table.
173  * The effect is that the code points for final state table rows
174  * are contiguous. The code points of final state rows follow each other
175  * in the order of the references to those final states by previous
176  * states, etc.
177  *
178  * For some terminal states, the offset is itself the output Unicode
179  * code point (16 bits for a BMP code point or 20 bits for a supplementary
180  * code point, stored as code point minus 0x10000 so that 20 bits are enough).
181  * For others, the code point in the Unicode table is stored with either
182  * one or two code units: one for BMP code points, two for a pair of
183  * surrogates.
184  * All code points for a final state entry take up the same number of code
185  * units, regardless of whether they all actually _use_ the same number
186  * of code units. This is necessary for simple array access.
187  *
188  * An additional feature comes in with what in ICU is called "fallback"
189  * mappings:
190  *
191  * In addition to round-trippable, precise, 1:1 mappings, there are often
192  * mappings defined between similar, though not the same, characters.
193  * Typically, such mappings occur only in fromUnicode mapping tables because
194  * Unicode has a superset repertoire of most other codepages. However, it
195  * is possible to provide such mappings in the toUnicode tables, too.
196  * In this case, the fallback mappings are partly integrated into the
197  * general state tables because the structure of the encoding includes their
198  * byte sequences.
199  * For final entries in an initial state, fallback mappings are stored in
200  * the entry itself like with roundtrip mappings.
201  * For other final entries, they are stored in the code units table if
202  * the entry is for a pair of code units.
203  * For single-unit results in the code units table, there is no space to
204  * alternatively hold a fallback mapping; in this case, the code unit
205  * is stored as U+fffe (unassigned), and the fallback mapping needs to
206  * be looked up by the scalar offset value in a separate table.
207  *
208  * "Unassigned" state entries really mean "structurally unassigned",
209  * i.e., such a byte sequence will never have a mapping result.
210  *
211  * The interpretation of the bits in each entry is as follows:
212  *
213  * Bit 31 not set, not a terminal entry ("transitional"):
214  * 30..24 next state
215  * 23..0 offset delta, to be added up
216  *
217  * Bit 31 set, terminal ("final") entry:
218  * 30..24 next state (regardless of action code)
219  * 23..20 action code:
220  * action codes 0 and 1 result in precise-mapping Unicode code points
221  * 0 valid byte sequence
222  * 19..16 not used, 0
223  * 15..0 16-bit Unicode BMP code point
224  * never U+fffe or U+ffff
225  * 1 valid byte sequence
226  * 19..0 20-bit Unicode supplementary code point
227  * never U+fffe or U+ffff
228  *
229  * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
230  * 2 valid byte sequence (fallback)
231  * 19..16 not used, 0
232  * 15..0 16-bit Unicode BMP code point as fallback result
233  * 3 valid byte sequence (fallback)
234  * 19..0 20-bit Unicode supplementary code point as fallback result
235  *
236  * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
237  * depending on the code units they result in
238  * 4 valid byte sequence
239  * 19..9 not used, 0
240  * 8..0 final offset delta
241  * pointing to one 16-bit code unit which may be
242  * fffe unassigned -- look for a fallback for this offset
243  * ffff illegal
244  * 5 valid byte sequence
245  * 19..9 not used, 0
246  * 8..0 final offset delta
247  * pointing to two 16-bit code units
248  * (typically UTF-16 surrogates)
249  * the result depends on the first code unit as follows:
250  * 0000..d7ff roundtrip BMP code point (1st alone)
251  * d800..dbff roundtrip surrogate pair (1st, 2nd)
252  * dc00..dfff fallback surrogate pair (1st-400, 2nd)
253  * e000 roundtrip BMP code point (2nd alone)
254  * e001 fallback BMP code point (2nd alone)
255  * fffe unassigned
256  * ffff illegal
257  * (the final offset deltas are at most 255 * 2,
258  * times 2 because of storing code unit pairs)
259  *
260  * 6 unassigned byte sequence
261  * 19..16 not used, 0
262  * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
263  * this does not contain a final offset delta because the main
264  * purpose of this action code is to save scalar offset values;
265  * therefore, fallback values cannot be assigned to byte
266  * sequences that result in this action code
267  * 7 illegal byte sequence
268  * 19..16 not used, 0
269  * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
270  * 8 state change only
271  * 19..0 not used, 0
272  * useful for state changes in simple stateful encodings,
273  * at Shift-In/Shift-Out codes
274  *
275  *
276  * 9..15 reserved for future use
277  * current implementations will only perform a state change
278  * and ignore bits 19..0
279  *
280  * An encoding with contiguous ranges of unassigned byte sequences, like
281  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
282  * at least two states for the trail bytes:
283  * One trail byte state that results in code points, and one that only
284  * has "unassigned" and "illegal" terminal states.
285  *
286  * Note: partly by accident, this data structure supports simple stateful
287  * encodings without any additional logic.
288  * Currently, only simple Shift-In/Shift-Out schemes are handled with
289  * appropriate state tables (especially EBCDIC_STATEFUL!).
290  *
291  * MBCS version 2 added:
292  * unassigned and illegal action codes have U+fffe and U+ffff
293  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
294  *
295  * Converting from Unicode to codepage bytes --------------------------------***
296  *
297  * The conversion data structure for fromUnicode is designed for the known
298  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
299  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
300  * a roundtrip mapping.
301  *
302  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
303  * like in the character properties table.
304  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
305  * with the resulting bytes is at offsetFromUBytes.
306  *
307  * Beginning with version 4, single-byte codepages have a significantly different
308  * trie compared to other codepages.
309  * In all cases, the entry in stage 1 is directly the index of the block of
310  * 64 entries in stage 2.
311  *
312  * Single-byte lookup:
313  *
314  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
315  * Stage 3 contains one 16-bit word per result:
316  * Bits 15..8 indicate the kind of result:
317  * f roundtrip result
318  * c fallback result from private-use code point
319  * 8 fallback result from other code points
320  * 0 unassigned
321  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
322  *
323  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
324  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
325  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
326  * ASCII code points can be looked up with a linear array access into stage 3.
327  * See maxFastUChar and other details in ucnvmbcs.h.
328  *
329  * Multi-byte lookup:
330  *
331  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
332  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
333  * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
334  * If this test is false, then a non-zero result will be interpreted as
335  * a fallback mapping.
336  * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
337  *
338  * Stage 3 contains 2, 3, or 4 bytes per result.
339  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
340  * while 3 bytes are stored as bytes in big-endian order.
341  * Leading zero bytes are ignored, and the number of bytes is counted.
342  * A zero byte mapping result is possible as a roundtrip result.
343  * For some output types, the actual result is processed from this;
344  * see ucnv_MBCSFromUnicodeWithOffsets().
345  *
346  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
347  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
348  *
349  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
350  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
351  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
352  * ASCII code points can be looked up with a linear array access into stage 3.
353  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
354  *
355  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
356  * for compaction.
357  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
358  * may overlap by any number of entries.
359  *
360  * MBCS version 2 added:
361  * the converter checks for known output types, which allows
362  * adding new ones without crashing an unaware converter
363  */
364 
365 /**
366  * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
367  * consecutive sequences of bytes, starting from the one encoded in value,
368  * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
369  * Does not currently support m:n mappings or reverse fallbacks.
370  * This function will not be called for sequences of bytes with leading zeros.
371  *
372  * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
373  * @param value contains 1..4 bytes of the first byte sequence, right-aligned
374  * @param codePoints resulting Unicode code points, or negative if a byte sequence does
375  * not map to anything
376  * @return TRUE to continue enumeration, FALSE to stop
377  */
378 typedef UBool U_CALLCONV
380 
381 static void U_CALLCONV
383  UConverterLoadArgs *pArgs,
384  const uint8_t *raw,
385  UErrorCode *pErrorCode);
386 
387 static void U_CALLCONV
389 
390 static void U_CALLCONV
392  UConverterLoadArgs *pArgs,
393  UErrorCode *pErrorCode);
394 
395 static UChar32 U_CALLCONV
397  UErrorCode *pErrorCode);
398 
399 static void U_CALLCONV
401  UBool starters[256],
402  UErrorCode *pErrorCode);
403 
405 static const char* U_CALLCONV
408 
409 static void U_CALLCONV
411  int32_t offsetIndex,
412  UErrorCode *pErrorCode);
413 
414 static UChar32 U_CALLCONV
416  UErrorCode *pErrorCode);
417 
418 static void U_CALLCONV
420  UConverterToUnicodeArgs *pToUArgs,
421  UErrorCode *pErrorCode);
422 
423 static void U_CALLCONV
425  const USetAdder *sa,
426  UConverterUnicodeSet which,
427  UErrorCode *pErrorCode);
428 
429 static void U_CALLCONV
431  UConverterToUnicodeArgs *pToUArgs,
432  UErrorCode *pErrorCode);
433 
435  UCNV_MBCS,
436 
439 
441  NULL,
442  NULL,
443 
449 
453  NULL,
455 
456  NULL,
458 };
459 
461  UCNV_MBCS,
462 
465 
467  NULL,
468  NULL,
469 
475 
479  NULL,
481 
482  NULL,
484 };
485 
486 static const UConverterImpl _MBCSImpl={
487  UCNV_MBCS,
488 
491 
493  NULL,
494  NULL,
495 
501 
505  NULL,
507  NULL,
508  NULL
509 };
510 
511 /* Static data is in tools/makeconv/ucnvstat.c for data-based
512  * converters. Be sure to update it as well.
513  */
514 
516  sizeof(UConverterSharedData), 1,
519 };
520 
521 
522 /* GB 18030 data ------------------------------------------------------------ */
523 
524 /* helper macros for linear values for GB 18030 four-byte sequences */
525 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
526 
527 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
528 
529 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
530 
531 /*
532  * Some ranges of GB 18030 where both the Unicode code points and the
533  * GB four-byte sequences are contiguous and are handled algorithmically by
534  * the special callback functions below.
535  * The values are start & end of Unicode & GB codes.
536  *
537  * Note that single surrogates are not mapped by GB 18030
538  * as of the re-released mapping tables from 2000-nov-30.
539  */
540 static const uint32_t
542  {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
543  {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
544  {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
545  {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
546  {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
547  {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
548  {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
549  {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
550  {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
551  {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
552  {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
553  {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
554  {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
555  {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
556 };
557 
558 /* bit flag for UConverter.options indicating GB 18030 special handling */
559 #define _MBCS_OPTION_GB18030 0x8000
560 
561 /* bit flag for UConverter.options indicating KEIS, JEF, JIPS special handling */
562 #define _MBCS_OPTION_KEIS 0x01000
563 #define _MBCS_OPTION_JEF 0x02000
564 #define _MBCS_OPTION_JIPS 0x04000
565 
566 #define KEIS_SO_CHAR_1 0x0A
567 #define KEIS_SO_CHAR_2 0x42
568 #define KEIS_SI_CHAR_1 0x0A
569 #define KEIS_SI_CHAR_2 0x41
570 
571 #define JEF_SO_CHAR 0x28
572 #define JEF_SI_CHAR 0x29
573 
574 #define JIPS_SO_CHAR_1 0x1A
575 #define JIPS_SO_CHAR_2 0x70
576 #define JIPS_SI_CHAR_1 0x1A
577 #define JIPS_SI_CHAR_2 0x71
578 
580  SI,
581  SO
582 };
583 typedef enum SISO_Option SISO_Option;
584 
586  int32_t SISOLength = 0;
587 
588  switch (option) {
589  case SI:
590  if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
591  value[0] = KEIS_SI_CHAR_1;
592  value[1] = KEIS_SI_CHAR_2;
593  SISOLength = 2;
594  } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
595  value[0] = JEF_SI_CHAR;
596  SISOLength = 1;
597  } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
598  value[0] = JIPS_SI_CHAR_1;
599  value[1] = JIPS_SI_CHAR_2;
600  SISOLength = 2;
601  } else {
602  value[0] = UCNV_SI;
603  SISOLength = 1;
604  }
605  break;
606  case SO:
607  if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
608  value[0] = KEIS_SO_CHAR_1;
609  value[1] = KEIS_SO_CHAR_2;
610  SISOLength = 2;
611  } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
612  value[0] = JEF_SO_CHAR;
613  SISOLength = 1;
614  } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
615  value[0] = JIPS_SO_CHAR_1;
616  value[1] = JIPS_SO_CHAR_2;
617  SISOLength = 2;
618  } else {
619  value[0] = UCNV_SO;
620  SISOLength = 1;
621  }
622  break;
623  default:
624  /* Should never happen. */
625  break;
626  }
627 
628  return SISOLength;
629 }
630 
631 /* Miscellaneous ------------------------------------------------------------ */
632 
633 /* similar to ucnv_MBCSGetNextUChar() but recursive */
634 static UBool
635 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
637  uint32_t value,
638  UConverterEnumToUCallback *callback, const void *context,
639  UErrorCode *pErrorCode) {
640  UChar32 codePoints[32];
641  const int32_t *row;
642  const uint16_t *unicodeCodeUnits;
643  UChar32 anyCodePoints;
644  int32_t b, limit;
645 
646  row=mbcsTable->stateTable[state];
647  unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
648 
649  value<<=8;
650  anyCodePoints=-1; /* becomes non-negative if there is a mapping */
651 
652  b=(stateProps[state]&0x38)<<2;
653  if(b==0 && stateProps[state]>=0x40) {
654  /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
655  codePoints[0]=U_SENTINEL;
656  b=1;
657  }
658  limit=((stateProps[state]&7)+1)<<5;
659  while(b<limit) {
660  int32_t entry=row[b];
663  if(stateProps[nextState]>=0) {
664  /* recurse to a state with non-ignorable actions */
665  if(!enumToU(
666  mbcsTable, stateProps, nextState,
668  value|(uint32_t)b,
669  callback, context,
670  pErrorCode)) {
671  return FALSE;
672  }
673  }
674  codePoints[b&0x1f]=U_SENTINEL;
675  } else {
676  UChar32 c;
677  int32_t action;
678 
679  /*
680  * An if-else-if chain provides more reliable performance for
681  * the most common cases compared to a switch.
682  */
685  /* output BMP code point */
687  } else if(action==MBCS_STATE_VALID_16) {
689  c=unicodeCodeUnits[finalOffset];
690  if(c<0xfffe) {
691  /* output BMP code point */
692  } else {
693  c=U_SENTINEL;
694  }
695  } else if(action==MBCS_STATE_VALID_16_PAIR) {
697  c=unicodeCodeUnits[finalOffset++];
698  if(c<0xd800) {
699  /* output BMP code point below 0xd800 */
700  } else if(c<=0xdbff) {
701  /* output roundtrip or fallback supplementary code point */
702  c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
703  } else if(c==0xe000) {
704  /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
705  c=unicodeCodeUnits[finalOffset];
706  } else {
707  c=U_SENTINEL;
708  }
709  } else if(action==MBCS_STATE_VALID_DIRECT_20) {
710  /* output supplementary code point */
712  } else {
713  c=U_SENTINEL;
714  }
715 
716  codePoints[b&0x1f]=c;
717  anyCodePoints&=c;
718  }
719  if(((++b)&0x1f)==0) {
720  if(anyCodePoints>=0) {
721  if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
722  return FALSE;
723  }
724  anyCodePoints=-1;
725  }
726  }
727  }
728  return TRUE;
729 }
730 
731 /*
732  * Only called if stateProps[state]==-1.
733  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
734  * MBCS_STATE_CHANGE_ONLY.
735  */
736 static int8_t
737 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
738  const int32_t *row;
739  int32_t min, max, entry, nextState;
740 
741  row=stateTable[state];
742  stateProps[state]=0;
743 
744  /* find first non-ignorable state */
745  for(min=0;; ++min) {
746  entry=row[min];
747  nextState=MBCS_ENTRY_STATE(entry);
748  if(stateProps[nextState]==-1) {
749  getStateProp(stateTable, stateProps, nextState);
750  }
752  if(stateProps[nextState]>=0) {
753  break;
754  }
756  break;
757  }
758  if(min==0xff) {
759  stateProps[state]=-0x40; /* (int8_t)0xc0 */
760  return stateProps[state];
761  }
762  }
763  stateProps[state]|=(int8_t)((min>>5)<<3);
764 
765  /* find last non-ignorable state */
766  for(max=0xff; min<max; --max) {
767  entry=row[max];
768  nextState=MBCS_ENTRY_STATE(entry);
769  if(stateProps[nextState]==-1) {
770  getStateProp(stateTable, stateProps, nextState);
771  }
773  if(stateProps[nextState]>=0) {
774  break;
775  }
777  break;
778  }
779  }
780  stateProps[state]|=(int8_t)(max>>5);
781 
782  /* recurse further and collect direct-state information */
783  while(min<=max) {
784  entry=row[min];
785  nextState=MBCS_ENTRY_STATE(entry);
786  if(stateProps[nextState]==-1) {
787  getStateProp(stateTable, stateProps, nextState);
788  }
790  stateProps[nextState]|=0x40;
792  stateProps[state]|=0x40;
793  }
794  }
795  ++min;
796  }
797  return stateProps[state];
798 }
799 
800 /*
801  * Internal function enumerating the toUnicode data of an MBCS converter.
802  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
803  * table, but could also be used for a future ucnv_getUnicodeSet() option
804  * that includes reverse fallbacks (after updating this function's implementation).
805  * Currently only handles roundtrip mappings.
806  * Does not currently handle extensions.
807  */
808 static void
810  UConverterEnumToUCallback *callback, const void *context,
811  UErrorCode *pErrorCode) {
812  /*
813  * Properties for each state, to speed up the enumeration.
814  * Ignorable actions are unassigned/illegal/state-change-only:
815  * They do not lead to mappings.
816  *
817  * Bits 7..6:
818  * 1 direct/initial state (stateful converters have multiple)
819  * 0 non-initial state with transitions or with non-ignorable result actions
820  * -1 final state with only ignorable actions
821  *
822  * Bits 5..3:
823  * The lowest byte value with non-ignorable actions is
824  * value<<5 (rounded down).
825  *
826  * Bits 2..0:
827  * The highest byte value with non-ignorable actions is
828  * (value<<5)&0x1f (rounded up).
829  */
830  int8_t stateProps[MBCS_MAX_STATE_COUNT];
831  int32_t state;
832 
833  uprv_memset(stateProps, -1, sizeof(stateProps));
834 
835  /* recurse from state 0 and set all stateProps */
836  getStateProp(mbcsTable->stateTable, stateProps, 0);
837 
838  for(state=0; state<mbcsTable->countStates; ++state) {
839  /*if(stateProps[state]==-1) {
840  printf("unused/unreachable <icu:state> %d\n", state);
841  }*/
842  if(stateProps[state]>=0x40) {
843  /* start from each direct state */
844  enumToU(
845  mbcsTable, stateProps, state, 0, 0,
846  callback, context,
847  pErrorCode);
848  }
849  }
850 }
851 
852 U_CFUNC void
854  const USetAdder *sa,
855  UConverterUnicodeSet which,
857  UErrorCode *pErrorCode) {
858  const UConverterMBCSTable *mbcsTable;
859  const uint16_t *table;
860 
861  uint32_t st3;
862  uint16_t st1, maxStage1, st2;
863 
864  UChar32 c;
865 
866  /* enumerate the from-Unicode trie table */
867  mbcsTable=&sharedData->mbcs;
868  table=mbcsTable->fromUnicodeTable;
869  if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
870  maxStage1=0x440;
871  } else {
872  maxStage1=0x40;
873  }
874 
875  c=0; /* keep track of the current code point while enumerating */
876 
877  if(mbcsTable->outputType==MBCS_OUTPUT_1) {
878  const uint16_t *stage2, *stage3, *results;
879  uint16_t minValue;
880 
881  results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
882 
883  /*
884  * Set a threshold variable for selecting which mappings to use.
885  * See ucnv_MBCSSingleFromBMPWithOffsets() and
886  * MBCS_SINGLE_RESULT_FROM_U() for details.
887  */
888  if(which==UCNV_ROUNDTRIP_SET) {
889  /* use only roundtrips */
890  minValue=0xf00;
891  } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
892  /* use all roundtrip and fallback results */
893  minValue=0x800;
894  }
895 
896  for(st1=0; st1<maxStage1; ++st1) {
897  st2=table[st1];
898  if(st2>maxStage1) {
899  stage2=table+st2;
900  for(st2=0; st2<64; ++st2) {
901  if((st3=stage2[st2])!=0) {
902  /* read the stage 3 block */
903  stage3=results+st3;
904 
905  do {
906  if(*stage3++>=minValue) {
907  sa->add(sa->set, c);
908  }
909  } while((++c&0xf)!=0);
910  } else {
911  c+=16; /* empty stage 3 block */
912  }
913  }
914  } else {
915  c+=1024; /* empty stage 2 block */
916  }
917  }
918  } else {
919  const uint32_t *stage2;
920  const uint8_t *stage3, *bytes;
921  uint32_t st3Multiplier;
922  uint32_t value;
923  UBool useFallback;
924 
925  bytes=mbcsTable->fromUnicodeBytes;
926 
927  useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
928 
929  switch(mbcsTable->outputType) {
930  case MBCS_OUTPUT_3:
931  case MBCS_OUTPUT_4_EUC:
932  st3Multiplier=3;
933  break;
934  case MBCS_OUTPUT_4:
935  st3Multiplier=4;
936  break;
937  default:
938  st3Multiplier=2;
939  break;
940  }
941 
942  for(st1=0; st1<maxStage1; ++st1) {
943  st2=table[st1];
944  if(st2>(maxStage1>>1)) {
945  stage2=(const uint32_t *)table+st2;
946  for(st2=0; st2<64; ++st2) {
947  if((st3=stage2[st2])!=0) {
948  /* read the stage 3 block */
949  stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
950 
951  /* get the roundtrip flags for the stage 3 block */
952  st3>>=16;
953 
954  /*
955  * Add code points for which the roundtrip flag is set,
956  * or which map to non-zero bytes if we use fallbacks.
957  * See ucnv_MBCSFromUnicodeWithOffsets() for details.
958  */
959  switch(filter) {
961  do {
962  if(st3&1) {
963  sa->add(sa->set, c);
964  stage3+=st3Multiplier;
965  } else if(useFallback) {
966  uint8_t b=0;
967  switch(st3Multiplier) {
968  case 4:
969  b|=*stage3++;
971  case 3:
972  b|=*stage3++;
974  case 2:
975  b|=stage3[0]|stage3[1];
976  stage3+=2;
978  default:
979  break;
980  }
981  if(b!=0) {
982  sa->add(sa->set, c);
983  }
984  }
985  st3>>=1;
986  } while((++c&0xf)!=0);
987  break;
989  /* Ignore single-byte results (<0x100). */
990  do {
991  if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
992  sa->add(sa->set, c);
993  }
994  st3>>=1;
995  stage3+=2; /* +=st3Multiplier */
996  } while((++c&0xf)!=0);
997  break;
999  /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
1000  do {
1001  if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
1002  sa->add(sa->set, c);
1003  }
1004  st3>>=1;
1005  stage3+=3; /* +=st3Multiplier */
1006  } while((++c&0xf)!=0);
1007  break;
1008  case UCNV_SET_FILTER_SJIS:
1009  /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
1010  do {
1011  if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
1012  sa->add(sa->set, c);
1013  }
1014  st3>>=1;
1015  stage3+=2; /* +=st3Multiplier */
1016  } while((++c&0xf)!=0);
1017  break;
1019  /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
1020  do {
1021  if( ((st3&1)!=0 || useFallback) &&
1022  (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
1023  (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
1024  ) {
1025  sa->add(sa->set, c);
1026  }
1027  st3>>=1;
1028  stage3+=2; /* +=st3Multiplier */
1029  } while((++c&0xf)!=0);
1030  break;
1031  case UCNV_SET_FILTER_HZ:
1032  /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
1033  do {
1034  if( ((st3&1)!=0 || useFallback) &&
1035  (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
1036  (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
1037  ) {
1038  sa->add(sa->set, c);
1039  }
1040  st3>>=1;
1041  stage3+=2; /* +=st3Multiplier */
1042  } while((++c&0xf)!=0);
1043  break;
1044  default:
1045  *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1046  return;
1047  }
1048  } else {
1049  c+=16; /* empty stage 3 block */
1050  }
1051  }
1052  } else {
1053  c+=1024; /* empty stage 2 block */
1054  }
1055  }
1056  }
1057 
1058  ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
1059 }
1060 
/*
 * Thin wrapper: forwards sharedData, sa, which and pErrorCode to a callee.
 * The visible condition tests whether the table's outputType is
 * MBCS_OUTPUT_DBCS_ONLY to choose one of two arguments.
 * NOTE(review): the function name line, the callee name and both branches of
 * the conditional are not visible in this listing.
 */
1061 U_CFUNC void
1063  const USetAdder *sa,
1064  UConverterUnicodeSet which,
1065  UErrorCode *pErrorCode) {
1067  sharedData, sa, which,
1068  sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
1071  pErrorCode);
1072 }
1073 
/*
 * Adds code points to the set behind sa.
 * First branch: adds U+0000..U+D7FF and U+E000..U+10FFFF, i.e. everything
 * except the surrogate range U+D800..U+DFFF.
 * Else branch: delegates to ucnv_MBCSGetUnicodeSetForUnicode() for
 * cnv->sharedData.
 * NOTE(review): the condition that selects the first branch is not visible
 * in this listing.
 */
1074 static void U_CALLCONV
1076  const USetAdder *sa,
1077  UConverterUnicodeSet which,
1078  UErrorCode *pErrorCode) {
1080  sa->addRange(sa->set, 0, 0xd7ff);
1081  sa->addRange(sa->set, 0xe000, 0x10ffff);
1082  } else {
1083  ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
1084  }
1085 }
1086 
1087 /* conversion extensions for input not in the main table -------------------- */
1088 
1089 /*
1090  * Hardcoded extension handling for GB 18030.
1091  * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
1092  *
1093  * In the future, conversion extensions may handle m:n mappings and delta tables,
1094  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
1095  *
1096  * If an input character cannot be mapped, then these functions set an error
1097  * code. The framework will then call the callback function.
1098  */
1099 
/*
 * From-Unicode handling for a code point cp that is tried here in two steps:
 * 1. if the shared data has extension indexes, an extension call is tried
 *    (its name is not visible in this listing); a true result returns 0;
 * 2. if the converter has the _MBCS_OPTION_GB18030 option, cp is looked up
 *    in gb18030Ranges and written as a four-byte sequence.
 * If neither applies, *pErrorCode is set to U_INVALID_CHAR_FOUND and cp is
 * returned.
 */
1100 /*
1101  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
1102  * else return 0 after output has been written to the target
1103  */
1104 static UChar32
1106  UChar32 cp,
1107  const UChar **source, const UChar *sourceLimit,
1108  uint8_t **target, const uint8_t *targetLimit,
1109  int32_t **offsets, int32_t sourceIndex,
1110  UBool flush,
1111  UErrorCode *pErrorCode) {
1112  const int32_t *cx;
1113 
1115 
1116  if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1118  cnv, cx,
1119  cp, source, sourceLimit,
1120  (char **)target, (char *)targetLimit,
1121  offsets, sourceIndex,
1122  flush,
1123  pErrorCode)
1124  ) {
1125  return 0; /* an extension mapping handled the input */
1126  }
1127 
1128  /* GB 18030 */
1129  if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
1130  const uint32_t *range;
1131  int32_t i;
1132 
/* walk the rows of gb18030Ranges; range advances by 4 values per row */
1133  range=gb18030Ranges[0];
1134  for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
/* range[0]..range[1] are the inclusive code point bounds of this row */
1135  if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
1136  /* found the Unicode code point, output the four-byte sequence for it */
1137  uint32_t linear;
1138  char bytes[4];
1139 
1140  /* get the linear value of the first GB 18030 code in this range */
/* NOTE(review): the statement that initializes linear is not visible in this listing */
1142 
1143  /* add the offset from the beginning of the range */
1144  linear+=((uint32_t)cp-range[0]);
1145 
1146  /* turn this into a four-byte sequence */
/* mixed radix, last byte first: 10 values from 0x30, 126 values from 0x81, 10 from 0x30, rest from 0x81 */
1147  bytes[3]=(char)(0x30+linear%10); linear/=10;
1148  bytes[2]=(char)(0x81+linear%126); linear/=126;
1149  bytes[1]=(char)(0x30+linear%10); linear/=10;
1150  bytes[0]=(char)(0x81+linear);
1151 
1152  /* output this sequence */
1154  bytes, 4, (char **)target, (char *)targetLimit,
1155  offsets, sourceIndex, pErrorCode);
1156  return 0;
1157  }
1158  }
1159  }
1160 
1161  /* no mapping */
1162  *pErrorCode=U_INVALID_CHAR_FOUND;
1163  return cp;
1164 }
1165 
/*
 * To-Unicode counterpart of the extension handling:
 * 1. if the shared data has extension indexes, an extension call is tried
 *    (its name is not visible in this listing); a true result returns 0;
 * 2. for 4-byte input on a converter with the _MBCS_OPTION_GB18030 option,
 *    the linear value of the input is looked up in gb18030Ranges and the
 *    matching code point is written with ucnv_toUWriteCodePoint().
 * If neither applies, *pErrorCode is set to U_INVALID_CHAR_FOUND and length
 * is returned.
 */
1166 /*
1167  * Input sequence: cnv->toUBytes[0..length[
1168  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1169  * else return 0 after output has been written to the target
1170  */
1171 static int8_t
1173  int8_t length,
1174  const uint8_t **source, const uint8_t *sourceLimit,
1175  UChar **target, const UChar *targetLimit,
1176  int32_t **offsets, int32_t sourceIndex,
1177  UBool flush,
1178  UErrorCode *pErrorCode) {
1179  const int32_t *cx;
1180 
1181  if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1183  cnv, cx,
1184  length, (const char **)source, (const char *)sourceLimit,
1185  target, targetLimit,
1186  offsets, sourceIndex,
1187  flush,
1188  pErrorCode)
1189  ) {
1190  return 0; /* an extension mapping handled the input */
1191  }
1192 
1193  /* GB 18030 */
1194  if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
1195  const uint32_t *range;
1196  uint32_t linear;
1197  int32_t i;
1198 
/* NOTE(review): the statement that computes linear from the input bytes is not visible in this listing */
1200  range=gb18030Ranges[0];
1201  for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
/* range[2]..range[3] are the inclusive linear byte-sequence bounds of this row */
1202  if(range[2]<=linear && linear<=range[3]) {
1203  /* found the sequence, output the Unicode code point for it */
/* reset the error code before writing the output */
1204  *pErrorCode=U_ZERO_ERROR;
1205 
1206  /* add the linear difference between the input and start sequences to the start code point */
1207  linear=range[0]+(linear-range[2]);
1208 
1209  /* output this code point */
1210  ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
1211 
1212  return 0;
1213  }
1214  }
1215  }
1216 
1217  /* no mapping */
1218  *pErrorCode=U_INVALID_CHAR_FOUND;
1219  return length;
1220 }
1221 
1222 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
1223 
1224 /*
1225  * This code modifies a standard EBCDIC<->Unicode mapping table for
1226  * OS/390 (z/OS) Unix System Services (Open Edition).
1227  * The difference is in the mapping of Line Feed and New Line control codes:
1228  * Standard EBCDIC maps
1229  *
1230  * <U000A> \x25 |0
1231  * <U0085> \x15 |0
1232  *
1233  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1234  * mapping
1235  *
1236  * <U000A> \x15 |0
1237  * <U0085> \x25 |0
1238  *
1239  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1240  * by copying it into allocated memory and swapping the LF and NL values.
1241  * This makes it possible to support the same EBCDIC charset in both versions without
1242  * duplicating the entire installed table.
1243  */
1244 
1245 /* standard EBCDIC codes */
/* Constants for the LF<->NL swap: byte values, flagged table values, and code points. */
1246 #define EBCDIC_LF 0x25
1247 #define EBCDIC_NL 0x15
1248 
1249 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
/* i.e. the byte values above with 0xf00 added */
1250 #define EBCDIC_RT_LF 0xf25
1251 #define EBCDIC_RT_NL 0xf15
1252 
1253 /* Unicode code points */
1254 #define U_LF 0x0a
1255 #define U_NL 0x85
1256 
/*
 * Builds the LF/NL-swapped variant of an EBCDIC table for sharedData->mbcs.
 *
 * One block is allocated that holds, in this order: a copy of the to-Unicode
 * state table, a copy of the from-Unicode result bytes, and a converter name
 * string. The copies are modified and then published through the
 * swapLFNLStateTable / swapLFNLFromUnicodeBytes / swapLFNLName fields if
 * swapLFNLStateTable is still NULL; otherwise the new block is freed.
 *
 * @return FALSE without touching *pErrorCode if the table does not qualify
 *         (the option is ignored); FALSE with *pErrorCode set to
 *         U_INVALID_FORMAT_ERROR (unknown fromUBytesLength) or
 *         U_MEMORY_ALLOCATION_ERROR; TRUE otherwise.
 *
 * NOTE(review): the function name/parameter line and several statements
 * (parts of the checks, the table modifications, the name suffix and the
 * locking calls) are not visible in this listing.
 */
1257 static UBool
1259  UConverterMBCSTable *mbcsTable;
1260 
1261  const uint16_t *table, *results;
1262  const uint8_t *bytes;
1263 
1264  int32_t (*newStateTable)[256];
1265  uint16_t *newResults;
1266  uint8_t *p;
1267  char *name;
1268 
1269  uint32_t stage2Entry;
1270  uint32_t size, sizeofFromUBytes;
1271 
1272  mbcsTable=&sharedData->mbcs;
1273 
1274  table=mbcsTable->fromUnicodeTable;
1275  bytes=mbcsTable->fromUnicodeBytes;
/* the same bytes viewed as 16-bit results */
1276  results=(const uint16_t *)bytes;
1277 
1278  /*
1279  * Check that this is an EBCDIC table with SBCS portion -
1280  * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1281  *
1282  * If not, ignore the option. Options are always ignored if they do not apply.
1283  */
1284  if(!(
1285  (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1288  )) {
1289  return FALSE;
1290  }
1291 
/* check the from-Unicode mappings of U_LF and U_NL, per output type */
1292  if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1293  if(!(
1296  )) {
1297  return FALSE;
1298  }
1299  } else /* MBCS_OUTPUT_2_SISO */ {
1300  stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1301  if(!(
1302  MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1304  )) {
1305  return FALSE;
1306  }
1307 
1308  stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1309  if(!(
1310  MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1312  )) {
1313  return FALSE;
1314  }
1315  }
1316 
1317  if(mbcsTable->fromUBytesLength>0) {
1318  /*
1319  * We _know_ the number of bytes in the fromUnicodeBytes array
1320  * starting with header.version 4.1.
1321  */
1322  sizeofFromUBytes=mbcsTable->fromUBytesLength;
1323  } else {
1324  /*
1325  * Otherwise:
1326  * There used to be code to enumerate the fromUnicode
1327  * trie and find the highest entry, but it was removed in ICU 3.2
1328  * because it was not tested and caused a low code coverage number.
1329  * See Jitterbug 3674.
1330  * This affects only some .cnv file formats with a header.version
1331  * below 4.1, and only when swaplfnl is requested.
1332  *
1333  * ucnvmbcs.c revision 1.99 is the last one with the
1334  * ucnv_MBCSSizeofFromUBytes() function.
1335  */
1336  *pErrorCode=U_INVALID_FORMAT_ERROR;
1337  return FALSE;
1338  }
1339 
1340  /*
1341  * The table has an appropriate format.
1342  * Allocate and build
1343  * - a modified to-Unicode state table
1344  * - a modified from-Unicode output array
1345  * - a converter name string with the swap option appended
1346  */
/* each state is a row of 256 int32_t, hence 1024 bytes per state */
1347  size=
1348  mbcsTable->countStates*1024+
1349  sizeofFromUBytes+
1351  p=(uint8_t *)uprv_malloc(size);
1352  if(p==NULL) {
1353  *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1354  return FALSE;
1355  }
1356 
1357  /* copy and modify the to-Unicode state table */
1358  newStateTable=(int32_t (*)[256])p;
1359  uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1360 
1363 
1364  /* copy and modify the from-Unicode result table */
/* the result copy starts directly after the last state row */
1365  newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1366  uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1367 
1368  /* conveniently, the table access macros work on the left side of expressions */
1369  if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1372  } else /* MBCS_OUTPUT_2_SISO */ {
/* the stage 2 entry is read from the original table, the value is written into the copy */
1373  stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1374  MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1375 
1376  stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1377  MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1378  }
1379 
1380  /* set the canonical converter name */
/* the name is stored directly after the copied result bytes */
1381  name=(char *)newResults+sizeofFromUBytes;
1382  uprv_strcpy(name, sharedData->staticData->name);
1384 
1385  /* set the pointers */
1387  if(mbcsTable->swapLFNLStateTable==NULL) {
1388  mbcsTable->swapLFNLStateTable=newStateTable;
1389  mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1390  mbcsTable->swapLFNLName=name;
1391 
/* ownership moved to mbcsTable; NULL prevents the free below */
1392  newStateTable=NULL;
1393  }
1395 
1396  /* release the allocated memory if another thread beat us to it */
/* newStateTable is the start of the single allocation, so this frees all three parts */
1397  if(newStateTable!=NULL) {
1398  uprv_free(newStateTable);
1399  }
1400  return TRUE;
1401 }
1402 
1403 /* reconstitute omitted fromUnicode data ------------------------------------ */
1404 
1405 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
/*
 * Enumeration callback (passed to ucnv_MBCSEnumToUnicode() by
 * reconstituteData()): for each of the 32 entries of codePoints that is not
 * negative, writes the codepage value into the stage 3 slot of that code
 * point and sets its roundtrip flag in stage 2. The value is incremented by
 * one for every array index, including skipped entries.
 *
 * @param context    NOTE(review): presumably the UConverterMBCSTable; the
 *                   line declaring mbcsTable is not visible in this listing
 * @param value      codepage value for codePoints[0]
 * @param codePoints 32 code points; negative entries are skipped
 * @return always TRUE
 */
1406 static UBool U_CALLCONV
1407 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1409  const uint16_t *table;
1410  uint32_t *stage2;
1411  uint8_t *bytes, *p;
1412  UChar32 c;
1413  int32_t i, st3;
1414 
1415  table=mbcsTable->fromUnicodeTable;
/* const is cast away because the bytes are written below */
1416  bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1417 
1418  /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1419  switch(mbcsTable->outputType) {
1420  case MBCS_OUTPUT_3_EUC:
1421  if(value<=0xffff) {
1422  /* short sequences are stored directly */
1423  /* code set 0 or 1 */
1424  } else if(value<=0x8effff) {
1425  /* code set 2 */
1426  value&=0x7fff;
1427  } else /* first byte is 0x8f */ {
1428  /* code set 3 */
1429  value&=0xff7f;
1430  }
1431  break;
1432  case MBCS_OUTPUT_4_EUC:
1433  if(value<=0xffffff) {
1434  /* short sequences are stored directly */
1435  /* code set 0 or 1 */
1436  } else if(value<=0x8effffff) {
1437  /* code set 2 */
1438  value&=0x7fffff;
1439  } else /* first byte is 0x8f */ {
1440  /* code set 3 */
1441  value&=0xff7fff;
1442  }
1443  break;
1444  default:
1445  break;
1446  }
1447 
1448  for(i=0; i<=0x1f; ++value, ++i) {
1449  c=codePoints[i];
1450  if(c<0) {
1451  continue;
1452  }
1453 
1454  /* locate the stage 2 & 3 data */
/* stage 1 is indexed by c>>10, stage 2 by bits 9..4 of c; stage 2 entries are 32-bit */
1455  stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1456  p=bytes;
/* low 16 bits of the stage 2 entry index a block of 16 stage 3 entries; c&0xf selects one */
1457  st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1458 
1459  /* write the codepage bytes into stage 3 */
1460  switch(mbcsTable->outputType) {
1461  case MBCS_OUTPUT_3:
1462  case MBCS_OUTPUT_4_EUC:
/* 3 bytes per entry, most significant byte first */
1463  p+=st3*3;
1464  p[0]=(uint8_t)(value>>16);
1465  p[1]=(uint8_t)(value>>8);
1466  p[2]=(uint8_t)value;
1467  break;
1468  case MBCS_OUTPUT_4:
1469  ((uint32_t *)p)[st3]=value;
1470  break;
1471  default:
1472  /* 2 bytes per character */
1473  ((uint16_t *)p)[st3]=(uint16_t)value;
1474  break;
1475  }
1476 
1477  /* set the roundtrip flag */
/* the upper 16 bits of the stage 2 entry hold one flag per stage 3 entry */
1478  *stage2|=(1UL<<(16+(c&0xf)));
1479  }
1480  return TRUE;
1481  }
1482 
/*
 * Rebuilds from-Unicode data in newly allocated memory.
 *
 * Allocates one zero-filled buffer (mbcsTable->reconstitutedData) laid out as
 * stage 1 (stage1Length 16-bit units), a full stage 2 (fullStage2Length
 * 32-bit units) and fromUBytesLength bytes for stage 3. The existing stage 1
 * is copied, the existing stage 2 units are copied to the END of the new
 * stage 2, and fromUnicodeTable/fromUnicodeBytes are redirected into the
 * buffer. The initial part of stage 2 is then filled from mbcsIndex, and
 * stage 3 is filled via ucnv_MBCSEnumToUnicode() with writeStage3Roundtrip.
 *
 * On allocation failure *pErrorCode is set to U_MEMORY_ALLOCATION_ERROR and
 * nothing else is changed.
 *
 * NOTE(review): the function name line with the first parameter(s) is not
 * visible in this listing.
 */
1483 static void
1485  uint32_t stage1Length, uint32_t stage2Length,
1486  uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */
1487  UErrorCode *pErrorCode) {
1488  uint16_t *stage1;
1489  uint32_t *stage2;
/* byte size: 2 bytes per stage 1 unit, 4 bytes per stage 2 unit, plus the stage 3 bytes */
1490  uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1491  mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1492  if(mbcsTable->reconstitutedData==NULL) {
1493  *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1494  return;
1495  }
1496  uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1497 
1498  /* copy existing data and reroute the pointers */
1499  stage1=(uint16_t *)mbcsTable->reconstitutedData;
1500  uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1501 
1502  stage2=(uint32_t *)(stage1+stage1Length);
/* the stored stage 2 units go to the tail of the full stage 2; the head is rebuilt below */
1503  uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1504  mbcsTable->fromUnicodeTable+stage1Length,
1505  stage2Length*4);
1506 
1507  mbcsTable->fromUnicodeTable=stage1;
1508  mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
1509 
1510  /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1511  stage2=(uint32_t *)stage1;
1512 
1513  /* reconstitute the initial part of stage 2 from the mbcsIndex */
1514  {
/* one mbcsIndex entry per 64 code points up to maxFastUChar */
1515  int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1516  int32_t stageUTF8Index=0;
1517  int32_t st1, st2, st3, i;
1518 
1519  for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1520  st2=stage1[st1];
/* a stage 1 value equal to stage1Length/2 is treated as "no stage 2 block" */
1521  if(st2!=(int32_t)stage1Length/2) {
1522  /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1523  for(i=0; i<16; ++i) {
1524  st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1525  if(st3!=0) {
1526  /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1527  st3>>=4;
1528  /*
1529  * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1530  * allocated together as a single 64-block for access from the mbcsIndex
1531  */
1532  stage2[st2++]=st3++;
1533  stage2[st2++]=st3++;
1534  stage2[st2++]=st3++;
1535  stage2[st2++]=st3;
1536  } else {
1537  /* no stage 3 block, skip */
1538  st2+=4;
1539  }
1540  }
1541  } else {
1542  /* no stage 2 block, skip */
1543  stageUTF8Index+=16;
1544  }
1545  }
1546  }
1547 
1548  /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1549  ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1550 }
1551 
1552 /* MBCS setup functions ----------------------------------------------------- */
1553 
/*
 * Initializes sharedData->mbcs from the raw converter data at raw.
 *
 * Two cases, selected by the outputType stored in the header flags:
 * - MBCS_OUTPUT_EXT_ONLY: the file contains only extension data. The base
 *   table named after the header is loaded with ucnv_load(), its
 *   UConverterMBCSTable is copied, and some fields are overridden for the
 *   extension converter. In some configurations the runtime-only outputType
 *   MBCS_OUTPUT_DBCS_ONLY is set.
 * - otherwise: the file has its own base table. The table pointers are set
 *   to offsets inside raw, utf8Friendly data is set up when the header
 *   version allows it, and omitted from-Unicode data is reconstituted if the
 *   header has the MBCS_OPT_NO_FROM_U option.
 *
 * With pArgs->onlyTestIsLoadable the function returns as soon as the format
 * has been validated.
 *
 * Errors reported through *pErrorCode in the visible code:
 * U_INVALID_TABLE_FORMAT, U_INVALID_TABLE_FILE, U_MEMORY_ALLOCATION_ERROR,
 * plus whatever ucnv_load() and reconstituteData() set.
 *
 * NOTE(review): the function name line and a few statements are not visible
 * in this listing (see the gaps in the listing line numbers).
 */
1554 static void U_CALLCONV
1556  UConverterLoadArgs *pArgs,
1557  const uint8_t *raw,
1558  UErrorCode *pErrorCode) {
1559  UDataInfo info;
1560  UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1561  _MBCSHeader *header=(_MBCSHeader *)raw;
1562  uint32_t offset;
1563  uint32_t headerLength;
1564  UBool noFromU=FALSE;
1565 
/* version 4 headers have a fixed length; version 5.3+ headers store the length in the options */
1566  if(header->version[0]==4) {
1567  headerLength=MBCS_HEADER_V4_LENGTH;
1568  } else if(header->version[0]==5 && header->version[1]>=3 &&
1570  headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1571  noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1572  } else {
1573  *pErrorCode=U_INVALID_TABLE_FORMAT;
1574  return;
1575  }
1576 
/* the low 8 bits of header->flags are the outputType */
1577  mbcsTable->outputType=(uint8_t)header->flags;
/* omitted from-Unicode data is rejected for MBCS_OUTPUT_1 tables */
1578  if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1579  *pErrorCode=U_INVALID_TABLE_FORMAT;
1580  return;
1581  }
1582 
1583  /* extension data, header version 4.2 and higher */
/* the remaining bits of header->flags are the byte offset from raw to the extension data */
1584  offset=header->flags>>8;
1585  if(offset!=0) {
1586  mbcsTable->extIndexes=(const int32_t *)(raw+offset);
1587  }
1588 
1589  if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1591  UConverterSharedData *baseSharedData;
1592  const int32_t *extIndexes;
1593  const char *baseName;
1594 
1595  /* extension-only file, load the base table and set values appropriately */
1596  if((extIndexes=mbcsTable->extIndexes)==NULL) {
1597  /* extension-only file without extension */
1598  *pErrorCode=U_INVALID_TABLE_FORMAT;
1599  return;
1600  }
1601 
1602  if(pArgs->nestedLoads!=1) {
1603  /* an extension table must not be loaded as a base table */
1604  *pErrorCode=U_INVALID_TABLE_FILE;
1605  return;
1606  }
1607 
1608  /* load the base table */
/* the base name string follows the header; headerLength counts 4-byte units */
1609  baseName=(const char *)header+headerLength*4;
1610  if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1611  /* forbid loading this same extension-only file */
1612  *pErrorCode=U_INVALID_TABLE_FORMAT;
1613  return;
1614  }
1615 
1616  /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1617  args.size=sizeof(UConverterLoadArgs);
/* nestedLoads=2 for the base table load; the check above rejects an extension-only base when this is not 1 */
1618  args.nestedLoads=2;
1619  args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
1620  args.reserved=pArgs->reserved;
1621  args.options=pArgs->options;
1622  args.pkg=pArgs->pkg;
1623  args.name=baseName;
1624  baseSharedData=ucnv_load(&args, pErrorCode);
1625  if(U_FAILURE(*pErrorCode)) {
1626  return;
1627  }
/* the base must be an MBCS converter that is not itself an extension of another base */
1628  if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1629  baseSharedData->mbcs.baseSharedData!=NULL
1630  ) {
1631  ucnv_unload(baseSharedData);
1632  *pErrorCode=U_INVALID_TABLE_FORMAT;
1633  return;
1634  }
1635  if(pArgs->onlyTestIsLoadable) {
1636  /*
1637  * Exit as soon as we know that we can load the converter
1638  * and the format is valid and supported.
1639  * The worst that can happen in the following code is a memory
1640  * allocation error.
1641  */
1642  ucnv_unload(baseSharedData);
1643  return;
1644  }
1645 
1646  /* copy the base table data */
1647  uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1648 
1649  /* overwrite values with relevant ones for the extension converter */
1650  mbcsTable->baseSharedData=baseSharedData;
1651  mbcsTable->extIndexes=extIndexes;
1652 
1653  /*
1654  * It would be possible to share the swapLFNL data with a base converter,
1655  * but the generated name would have to be different, and the memory
1656  * would have to be free'd only once.
1657  * It is easier to just create the data for the extension converter
1658  * separately when it is requested.
1659  */
1660  mbcsTable->swapLFNLStateTable=NULL;
1661  mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1662  mbcsTable->swapLFNLName=NULL;
1663 
1664  /*
1665  * The reconstitutedData must be deleted only when the base converter
1666  * is unloaded.
1667  */
1668  mbcsTable->reconstitutedData=NULL;
1669 
1670  /*
1671  * Set a special, runtime-only outputType if the extension converter
1672  * is a DBCS version of a base converter that also maps single bytes.
1673  */
1674  if( sharedData->staticData->conversionType==UCNV_DBCS ||
1675  (sharedData->staticData->conversionType==UCNV_MBCS &&
1676  sharedData->staticData->minBytesPerChar>=2)
1677  ) {
1678  if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1679  /* the base converter is SI/SO-stateful */
1680  int32_t entry;
1681 
1682  /* get the dbcs state from the state table entry for SO=0x0e */
1683  entry=mbcsTable->stateTable[0][0xe];
1684  if( MBCS_ENTRY_IS_FINAL(entry) &&
1687  ) {
1689 
1690  mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1691  }
1692  } else if(
1693  baseSharedData->staticData->conversionType==UCNV_MBCS &&
1694  baseSharedData->staticData->minBytesPerChar==1 &&
1695  baseSharedData->staticData->maxBytesPerChar==2 &&
1696  mbcsTable->countStates<=127
1697  ) {
1698  /* non-stateful base converter, need to modify the state table */
1699  int32_t (*newStateTable)[256];
1700  int32_t *state;
1701  int32_t i, count;
1702 
1703  /* allocate a new state table and copy the base state table contents */
1704  count=mbcsTable->countStates;
/* room for one extra state; each state is 256 int32_t = 1024 bytes */
1705  newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1706  if(newStateTable==NULL) {
1707  ucnv_unload(baseSharedData);
1708  *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1709  return;
1710  }
1711 
1712  uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1713 
1714  /* change all final single-byte entries to go to a new all-illegal state */
1715  state=newStateTable[0];
1716  for(i=0; i<256; ++i) {
1717  if(MBCS_ENTRY_IS_FINAL(state[i])) {
1719  }
1720  }
1721 
1722  /* build the new all-illegal state */
1723  state=newStateTable[count];
1724  for(i=0; i<256; ++i) {
1726  }
1727  mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1728  mbcsTable->countStates=(uint8_t)(count+1);
/* marks the state table as allocated here so that ucnv_MBCSUnload() frees it */
1729  mbcsTable->stateTableOwned=TRUE;
1730 
1731  mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1732  }
1733  }
1734 
1735  /*
1736  * unlike below for files with base tables, do not get the unicodeMask
1737  * from the sharedData; instead, use the base table's unicodeMask,
1738  * which we copied in the memcpy above;
1739  * this is necessary because the static data unicodeMask, especially
1740  * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1741  */
1742  } else {
1743  /* conversion file with a base table; an additional extension table is optional */
1744  /* make sure that the output type is known */
1745  switch(mbcsTable->outputType) {
1746  case MBCS_OUTPUT_1:
1747  case MBCS_OUTPUT_2:
1748  case MBCS_OUTPUT_3:
1749  case MBCS_OUTPUT_4:
1750  case MBCS_OUTPUT_3_EUC:
1751  case MBCS_OUTPUT_4_EUC:
1752  case MBCS_OUTPUT_2_SISO:
1753  /* OK */
1754  break;
1755  default:
1756  *pErrorCode=U_INVALID_TABLE_FORMAT;
1757  return;
1758  }
1759  if(pArgs->onlyTestIsLoadable) {
1760  /*
1761  * Exit as soon as we know that we can load the converter
1762  * and the format is valid and supported.
1763  * The worst that can happen in the following code is a memory
1764  * allocation error.
1765  */
1766  return;
1767  }
1768 
/* all table pointers below point into raw; nothing is copied */
1769  mbcsTable->countStates=(uint8_t)header->countStates;
1770  mbcsTable->countToUFallbacks=header->countToUFallbacks;
/* the state table follows the header; the to-Unicode fallbacks follow the state table */
1771  mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
1772  mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1773  mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1774 
1775  mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1776  mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1777  mbcsTable->fromUBytesLength=header->fromUBytesLength;
1778 
1779  /*
1780  * converter versions 6.1 and up contain a unicodeMask that is
1781  * used here to select the most efficient function implementations
1782  */
1783  info.size=sizeof(UDataInfo);
1784  udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1785  if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1786  /* mask off possible future extensions to be safe */
1787  mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1788  } else {
1789  /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1791  }
1792 
1793  /*
1794  * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1795  * Check for the header version, SBCS vs. MBCS, and for whether the
1796  * data structures are optimized for code points as high as what the
1797  * runtime code is designed for.
1798  * The implementation does not handle mapping tables with entries for
1799  * unpaired surrogates.
1800  */
/* header->version[2] is compared against the high byte of the fast-path maximum code point */
1801  if( header->version[1]>=3 &&
1802  (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1803  (mbcsTable->countStates==1 ?
1804  (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1805  (header->version[2]>=(MBCS_FAST_MAX>>8))
1806  )
1807  ) {
1808  mbcsTable->utf8Friendly=TRUE;
1809 
1810  if(mbcsTable->countStates==1) {
1811  /*
1812  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1813  * Build a table with indexes to each block, to be used instead of
1814  * the regular stage 1/2 table.
1815  */
1816  int32_t i;
/* i counts 64-code-point blocks: stage 1 index is i>>4, and (i<<2)&0x3c is the stage 2 index of the block's first 16 code points */
1817  for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1818  mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1819  }
1820  /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1821  mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1822  } else {
1823  /*
1824  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1825  * The .cnv file is prebuilt with an additional stage table with indexes
1826  * to each block.
1827  */
/* mbcsIndex follows the from-Unicode bytes, or starts at fromUnicodeBytes when they are omitted */
1828  mbcsTable->mbcsIndex=(const uint16_t *)
1829  (mbcsTable->fromUnicodeBytes+
1830  (noFromU ? 0 : mbcsTable->fromUBytesLength));
1831  mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1832  }
1833  }
1834 
1835  /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1836  {
1837  uint32_t asciiRoundtrips=0xffffffff;
1838  int32_t i;
1839 
/* a bit is cleared as soon as any of its 4 bytes does not map directly to the same code point value */
1840  for(i=0; i<0x80; ++i) {
1841  if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1842  asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1843  }
1844  }
1845  mbcsTable->asciiRoundtrips=asciiRoundtrips;
1846  }
1847 
1848  if(noFromU) {
/* stage 1 length in 16-bit units depends on whether supplementary code points are covered */
1849  uint32_t stage1Length=
1850  mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1851  0x440 : 0x40;
/* stored stage 2 length in 32-bit units: table size in bytes /4, minus the space taken by stage 1 */
1852  uint32_t stage2Length=
1853  (header->offsetFromUBytes-header->offsetFromUTable)/4-
1854  stage1Length/2;
1855  reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1856  }
1857  }
1858 
1859  /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1860  if(mbcsTable->utf8Friendly) {
1861  if(mbcsTable->countStates==1) {
1862  sharedData->impl=&_SBCSUTF8Impl;
1863  } else {
1864  if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1865  sharedData->impl=&_DBCSUTF8Impl;
1866  }
1867  }
1868  }
1869 
1870  if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1871  /*
1872  * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1873  * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1874  */
1875  mbcsTable->asciiRoundtrips=0;
1876  }
1877 }
1878 
/*
 * Releases what the MBCS table of sharedData owns: the swapLFNL block, the
 * state table if stateTableOwned is set, the base converter (via
 * ucnv_unload()) and the reconstituted from-Unicode data.
 * NOTE(review): the function name/parameter line is not visible in this
 * listing.
 */
1879 static void U_CALLCONV
1881  UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1882 
/* only swapLFNLStateTable is freed here; _EBCDICSwapLFNL() makes it the start of the single block that also holds the swapped bytes and name */
1883  if(mbcsTable->swapLFNLStateTable!=NULL) {
1884  uprv_free(mbcsTable->swapLFNLStateTable);
1885  }
1886  if(mbcsTable->stateTableOwned) {
1887  uprv_free((void *)mbcsTable->stateTable);
1888  }
1889  if(mbcsTable->baseSharedData!=NULL) {
1890  ucnv_unload(mbcsTable->baseSharedData);
1891  }
1892  if(mbcsTable->reconstitutedData!=NULL) {
1893  uprv_free(mbcsTable->reconstitutedData);
1894  }
1895 }
1896 
/*
 * Per-converter setup after the shared MBCS data has been loaded.
 *
 * - Returns immediately when pArgs->onlyTestIsLoadable is set.
 * - Removes the UCNV_OPTION_SWAP_LFNL option from both cnv->options and
 *   pArgs->options when it does not apply (MBCS_OUTPUT_DBCS_ONLY tables, or
 *   _EBCDICSwapLFNL() returning FALSE without an error).
 * - Selects name-dependent modes from pArgs->name (GB 18030, KEIS, JEF, JIPS).
 * - Adjusts cnv->maxBytesPerUChar for SI/SO-stateful tables and for
 *   extension data.
 *
 * Fix: the option used to be "removed" with options&=~~FLAG. The double
 * complement is a no-op, so that statement kept only the swap bit and
 * cleared every other option. A single complement clears just the swap bit.
 *
 * NOTE(review): the function name line, the locking calls around the cache
 * check and the statements that set the name-dependent option flags are not
 * visible in this listing.
 */
1897 static void U_CALLCONV
1899  UConverterLoadArgs *pArgs,
1900  UErrorCode *pErrorCode) {
1901  UConverterMBCSTable *mbcsTable;
1902  const int32_t *extIndexes;
1903  uint8_t outputType;
1904  int8_t maxBytesPerUChar;
1905 
1906  if(pArgs->onlyTestIsLoadable) {
1907  return;
1908  }
1909 
1910  mbcsTable=&cnv->sharedData->mbcs;
1911  outputType=mbcsTable->outputType;
1912 
1913  if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1914  /* the swaplfnl option does not apply, remove it */
/* clear only the swap bit, in pArgs->options and in cnv->options */
1915  cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1916  }
1917 
1918  if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1919  /* do this because double-checked locking is broken */
1920  UBool isCached;
1921 
1923  isCached=mbcsTable->swapLFNLStateTable!=NULL;
1925 
1926  if(!isCached) {
1927  if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1928  if(U_FAILURE(*pErrorCode)) {
1929  return; /* something went wrong */
1930  }
1931 
1932  /* the option does not apply, remove it */
/* clear only the swap bit, in pArgs->options and in cnv->options */
1933  cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
1934  }
1935  }
1936  }
1937 
/* the name checks are case-sensitive substring matches on the two spellings shown */
1938  if(uprv_strstr(pArgs->name, "18030")!=NULL) {
1939  if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
1940  /* set a flag for GB 18030 mode, which changes the callback behavior */
1942  }
1943  } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
1944  /* set a flag for KEIS converter, which changes the SI/SO character sequence */
1946  } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
1947  /* set a flag for JEF converter, which changes the SI/SO character sequence */
1949  } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
1950  /* set a flag for JIPS converter, which changes the SI/SO character sequence */
1952  }
1953 
1954  /* fix maxBytesPerUChar depending on outputType and options etc. */
1955  if(outputType==MBCS_OUTPUT_2_SISO) {
1956  cnv->maxBytesPerUChar=3; /* SO+DBCS */
1957  }
1958 
1959  extIndexes=mbcsTable->extIndexes;
1960  if(extIndexes!=NULL) {
1961  maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1962  if(outputType==MBCS_OUTPUT_2_SISO) {
1963  ++maxBytesPerUChar; /* SO + multiple DBCS */
1964  }
1965 
/* only ever raise the limit, never lower it */
1966  if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1967  cnv->maxBytesPerUChar=maxBytesPerUChar;
1968  }
1969  }
1970 
1971 #if 0
1972  /*
1973  * documentation of UConverter fields used for status
1974  * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1975  */
1976 
1977  /* toUnicode */
1978  cnv->toUnicodeStatus=0; /* offset */
1979  cnv->mode=0; /* state */
1980  cnv->toULength=0; /* byteIndex */
1981 
1982  /* fromUnicode */
1983  cnv->fromUChar32=0;
1984  cnv->fromUnicodeStatus=1; /* prevLength */
1985 #endif
1986 }
1987 
1989 
/*
 * Return the converter name to report for cnv: either the swapLFNLName
 * stored in the shared MBCS data, or the name from the shared static data.
 *
 * NOTE(review): listing lines 1991-1992 (function name, parameter list and
 * the condition that opens the `if`) are missing from this extract; the
 * visible `} else {` shows that the first return is conditional. Confirm the
 * exact condition against the upstream file.
 */
1990 static const char* U_CALLCONV
1993  return cnv->sharedData->mbcs.swapLFNLName;
1994  } else {
1995  return cnv->sharedData->staticData->name;
1996  }
1997 }
1999 
2000 
2001 /* MBCS-to-Unicode conversion functions ------------------------------------- */
2002 
/*
 * Look up a toUnicode fallback mapping for a result offset.
 *
 * Binary-searches mbcsTable->toUFallbacks (countToUFallbacks entries) for an
 * entry whose .offset equals `offset`.
 * Returns that entry's codePoint, or 0xfffe when the table is empty or holds
 * no entry for `offset`.
 * The search is only correct if the entries are sorted by ascending .offset
 * -- TODO confirm against the code that builds the table.
 *
 * NOTE(review): listing line 2004 (function name and parameter list) is
 * missing from this extract; the body uses parameters named mbcsTable and
 * offset.
 */
2003 static UChar32 U_CALLCONV
2005  const _MBCSToUFallback *toUFallbacks;
2006  uint32_t i, start, limit;
2007 
2008  limit=mbcsTable->countToUFallbacks;
2009  if(limit>0) {
2010  /* do a binary search for the fallback mapping */
2011  toUFallbacks=mbcsTable->toUFallbacks;
2012  start=0;
     /* narrow the half-open range [start, limit) until one candidate remains */
2013  while(start<limit-1) {
2014  i=(start+limit)/2;
2015  if(offset<toUFallbacks[i].offset) {
2016  limit=i;
2017  } else {
2018  start=i;
2019  }
2020  }
2021 
2022  /* did we really find it? */
2023  if(offset==toUFallbacks[start].offset) {
2024  return toUFallbacks[start].codePoint;
2025  }
2026  }
2027 
     /* no fallback mapping for this offset */
2028  return 0xfffe;
2029 }
2030 
2031 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
/*
 * Each source byte is looked up in row 0 of the state table (the swapLFNL
 * state table when UCNV_OPTION_SWAP_LFNL is set in cnv->options) and the
 * entry is treated as final.
 * Output UChars are written to pArgs->target; when pArgs->offsets is not
 * NULL, one source index is written per output UChar.
 * On exit pArgs->source, pArgs->target and pArgs->offsets are written back.
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
 * to U_ILLEGAL_CHAR_FOUND for an illegal byte; the extension step may also
 * set it.
 *
 * NOTE(review): listing lines 2033, 2086, 2088, 2102-2104, 2106, 2121,
 * 2128-2129, 2131 and 2157 are missing from this extract (gaps in the
 * embedded numbering). They held the function name/first parameter, several
 * `if` conditions, the BMP stores and the start of the extension call.
 * Restore them from the upstream file before compiling.
 */
2032 static void
2034  UErrorCode *pErrorCode) {
2035  UConverter *cnv;
2036  const uint8_t *source, *sourceLimit;
2037  UChar *target;
2038  const UChar *targetLimit;
2039  int32_t *offsets;
2040 
2041  const int32_t (*stateTable)[256];
2042 
2043  int32_t sourceIndex;
2044 
2045  int32_t entry;
2046  UChar c;
2047  uint8_t action;
2048 
2049  /* set up the local pointers */
2050  cnv=pArgs->converter;
2051  source=(const uint8_t *)pArgs->source;
2052  sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2053  target=pArgs->target;
2054  targetLimit=pArgs->targetLimit;
2055  offsets=pArgs->offsets;
2056 
2057  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2058  stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2059  } else {
2060  stateTable=cnv->sharedData->mbcs.stateTable;
2061  }
2062 
2063  /* sourceIndex=-1 if the current character began in the previous buffer */
2064  sourceIndex=0;
2065 
2066  /* conversion loop */
2067  while(source<sourceLimit) {
2068  /*
2069  * This following test is to see if available input would overflow the output.
2070  * It does not catch output of more than one code unit that
2071  * overflows as a result of a surrogate pair or callback output
2072  * from the last source byte.
2073  * Therefore, those situations also test for overflows and will
2074  * then break the loop, too.
2075  */
2076  if(target>=targetLimit) {
2077  /* target is full */
2078  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2079  break;
2080  }
2081 
2082  entry=stateTable[0][*source++];
2083  /* MBCS_ENTRY_IS_FINAL(entry) */
2084 
2085  /* test the most common case first */
     /* NOTE(review): listing lines 2086 (the `if` condition) and 2088 (the store to target) are missing here */
2087  /* output BMP code point */
2089  if(offsets!=NULL) {
2090  *offsets++=sourceIndex;
2091  }
2092 
2093  /* normal end of action codes: prepare for a new character */
2094  ++sourceIndex;
2095  continue;
2096  }
2097 
2098  /*
2099  * An if-else-if chain provides more reliable performance for
2100  * the most common cases compared to a switch.
2101  */
     /* NOTE(review): listing lines 2102-2104 (assignment of `action` and the start of the first condition) and 2106 are missing here */
2105  ) {
2107  /* output surrogate pair */
2108  *target++=(UChar)(0xd800|(UChar)(entry>>10));
2109  if(offsets!=NULL) {
2110  *offsets++=sourceIndex;
2111  }
2112  c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2113  if(target<targetLimit) {
2114  *target++=c;
2115  if(offsets!=NULL) {
2116  *offsets++=sourceIndex;
2117  }
2118  } else {
2119  /* target overflow */
     /* the trail surrogate does not fit: park it in the converter's UChar error buffer */
2120  cnv->UCharErrorBuffer[0]=c;
     /* NOTE(review): listing line 2121 is missing; presumably it records the error-buffer length -- confirm */
2122  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2123  break;
2124  }
2125 
2126  ++sourceIndex;
2127  continue;
     /* NOTE(review): listing lines 2128-2129 (next `else if` branch and its nested condition) and 2131 (the store) are missing here */
2130  /* output BMP code point */
2132  if(offsets!=NULL) {
2133  *offsets++=sourceIndex;
2134  }
2135 
2136  ++sourceIndex;
2137  continue;
2138  }
2139  } else if(action==MBCS_STATE_UNASSIGNED) {
2140  /* just fall through */
2141  } else if(action==MBCS_STATE_ILLEGAL) {
2142  /* callback(illegal) */
2143  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2144  } else {
2145  /* reserved, must never occur */
2146  ++sourceIndex;
2147  continue;
2148  }
2149 
2150  if(U_FAILURE(*pErrorCode)) {
2151  /* callback(illegal) */
2152  break;
2153  } else /* unassigned sequences indicated with byteIndex>0 */ {
2154  /* try an extension mapping */
     /* remember where source stood so the bytes consumed by the extension step can be counted below */
2155  pArgs->source=(const char *)source;
2156  cnv->toUBytes[0]=*(source-1);
     /* NOTE(review): listing line 2157 (callee and first arguments of this call) is missing here */
2158  1, &source, sourceLimit,
2159  &target, targetLimit,
2160  &offsets, sourceIndex,
2161  pArgs->flush,
2162  pErrorCode);
2163  sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
2164 
2165  if(U_FAILURE(*pErrorCode)) {
2166  /* not mappable or buffer overflow */
2167  break;
2168  }
2169  }
2170  }
2171 
2172  /* write back the updated pointers */
2173  pArgs->source=(const char *)source;
2174  pArgs->target=target;
2175  pArgs->offsets=offsets;
2176 }
2177 
2178 /*
2179  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
2180  * that only map to and from the BMP.
2181  * In addition to single-byte optimizations, the offset calculations
2182  * become much easier.
2183  */
/*
 * Because one source byte yields one UChar here, a single counter
 * (targetCapacity, clamped to the remaining source length) bounds the loops.
 * Offsets are not written per character in the fast paths; they are filled
 * in batches from lastSource up to the current source position.
 * On exit pArgs->source, pArgs->target and pArgs->offsets are written back;
 * *pErrorCode is U_BUFFER_OVERFLOW_ERROR if source bytes remain while the
 * target is full, U_ILLEGAL_CHAR_FOUND for an illegal byte, or whatever the
 * extension step set.
 *
 * NOTE(review): listing lines 2185, the odd lines 2237-2267, 2311, 2313,
 * 2322-2324, 2326 and 2358 are missing from this extract (gaps in the
 * embedded numbering). They held the function name/first parameter, the
 * sixteen unrolled stores, several conditions/stores and the start of the
 * extension call. Restore them from the upstream file before compiling.
 */
2184 static void
2186  UErrorCode *pErrorCode) {
2187  UConverter *cnv;
2188  const uint8_t *source, *sourceLimit, *lastSource;
2189  UChar *target;
2190  int32_t targetCapacity, length;
2191  int32_t *offsets;
2192 
2193  const int32_t (*stateTable)[256];
2194 
2195  int32_t sourceIndex;
2196 
2197  int32_t entry;
2198  uint8_t action;
2199 
2200  /* set up the local pointers */
2201  cnv=pArgs->converter;
2202  source=(const uint8_t *)pArgs->source;
2203  sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2204  target=pArgs->target;
2205  targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
2206  offsets=pArgs->offsets;
2207 
2208  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2209  stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2210  } else {
2211  stateTable=cnv->sharedData->mbcs.stateTable;
2212  }
2213 
2214  /* sourceIndex=-1 if the current character began in the previous buffer */
2215  sourceIndex=0;
2216  lastSource=source;
2217 
2218  /*
2219  * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
2220  * for the minimum of the sourceLength and targetCapacity
2221  */
2222  length=(int32_t)(sourceLimit-source);
2223  if(length<targetCapacity) {
2224  targetCapacity=length;
2225  }
2226 
2227 #if MBCS_UNROLL_SINGLE_TO_BMP
2228  /* unrolling makes it faster on Pentium III/Windows 2000 */
2229  /* unroll the loop with the most common case */
2230 unrolled:
2231  if(targetCapacity>=16) {
2232  int32_t count, loops, oredEntries;
2233 
2234  loops=count=targetCapacity>>4;
2235  do {
     /*
      * All 16 entries are OR-ed together so that one test after the group
      * can tell whether every byte took the common path.
      * NOTE(review): the odd listing lines 2237-2267 are missing; since the
      * roll-back below does target-=16, each presumably stores one UChar
      * through target -- confirm.
      */
2236  oredEntries=entry=stateTable[0][*source++];
2238  oredEntries|=entry=stateTable[0][*source++];
2240  oredEntries|=entry=stateTable[0][*source++];
2242  oredEntries|=entry=stateTable[0][*source++];
2244  oredEntries|=entry=stateTable[0][*source++];
2246  oredEntries|=entry=stateTable[0][*source++];
2248  oredEntries|=entry=stateTable[0][*source++];
2250  oredEntries|=entry=stateTable[0][*source++];
2252  oredEntries|=entry=stateTable[0][*source++];
2254  oredEntries|=entry=stateTable[0][*source++];
2256  oredEntries|=entry=stateTable[0][*source++];
2258  oredEntries|=entry=stateTable[0][*source++];
2260  oredEntries|=entry=stateTable[0][*source++];
2262  oredEntries|=entry=stateTable[0][*source++];
2264  oredEntries|=entry=stateTable[0][*source++];
2266  oredEntries|=entry=stateTable[0][*source++];
2268 
2269  /* were all 16 entries really valid? */
2270  if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2271  /* no, return to the first of these 16 */
2272  source-=16;
2273  target-=16;
2274  break;
2275  }
2276  } while(--count>0);
     /* count becomes the number of 16-byte groups that were fully converted */
2277  count=loops-count;
2278  targetCapacity-=16*count;
2279 
2280  if(offsets!=NULL) {
     /* these bytes get their offsets right here, so move the batch marker past them */
2281  lastSource+=16*count;
2282  while(count>0) {
2283  *offsets++=sourceIndex++;
2284  *offsets++=sourceIndex++;
2285  *offsets++=sourceIndex++;
2286  *offsets++=sourceIndex++;
2287  *offsets++=sourceIndex++;
2288  *offsets++=sourceIndex++;
2289  *offsets++=sourceIndex++;
2290  *offsets++=sourceIndex++;
2291  *offsets++=sourceIndex++;
2292  *offsets++=sourceIndex++;
2293  *offsets++=sourceIndex++;
2294  *offsets++=sourceIndex++;
2295  *offsets++=sourceIndex++;
2296  *offsets++=sourceIndex++;
2297  *offsets++=sourceIndex++;
2298  *offsets++=sourceIndex++;
2299  --count;
2300  }
2301  }
2302  }
2303 #endif
2304 
2305  /* conversion loop */
2306  while(targetCapacity > 0 && source < sourceLimit) {
2307  entry=stateTable[0][*source++];
2308  /* MBCS_ENTRY_IS_FINAL(entry) */
2309 
2310  /* test the most common case first */
     /* NOTE(review): listing lines 2311 (the `if` condition) and 2313 (the store to target) are missing here */
2312  /* output BMP code point */
2314  --targetCapacity;
2315  continue;
2316  }
2317 
2318  /*
2319  * An if-else-if chain provides more reliable performance for
2320  * the most common cases compared to a switch.
2321  */
     /* NOTE(review): listing lines 2322-2324 (assignment of `action` and the first branch conditions) and 2326 (the store) are missing here */
2325  /* output BMP code point */
2327  --targetCapacity;
2328  continue;
2329  }
2330  } else if(action==MBCS_STATE_UNASSIGNED) {
2331  /* just fall through */
2332  } else if(action==MBCS_STATE_ILLEGAL) {
2333  /* callback(illegal) */
2334  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2335  } else {
2336  /* reserved, must never occur */
2337  continue;
2338  }
2339 
2340  /* set offsets since the start or the last extension */
2341  if(offsets!=NULL) {
2342  int32_t count=(int32_t)(source-lastSource);
2343 
2344  /* predecrement: do not set the offset for the callback-causing character */
2345  while(--count>0) {
2346  *offsets++=sourceIndex++;
2347  }
2348  /* offset and sourceIndex are now set for the current character */
2349  }
2350 
2351  if(U_FAILURE(*pErrorCode)) {
2352  /* callback(illegal) */
2353  break;
2354  } else /* unassigned sequences indicated with byteIndex>0 */ {
2355  /* try an extension mapping */
2356  lastSource=source;
2357  cnv->toUBytes[0]=*(source-1);
     /* NOTE(review): listing line 2358 (callee and first arguments of this call) is missing here */
2359  1, &source, sourceLimit,
2360  &target, pArgs->targetLimit,
2361  &offsets, sourceIndex,
2362  pArgs->flush,
2363  pErrorCode);
2364  sourceIndex+=1+(int32_t)(source-lastSource);
2365 
2366  if(U_FAILURE(*pErrorCode)) {
2367  /* not mappable or buffer overflow */
2368  break;
2369  }
2370 
2371  /* recalculate the targetCapacity after an extension mapping */
2372  targetCapacity=(int32_t)(pArgs->targetLimit-target);
2373  length=(int32_t)(sourceLimit-source);
2374  if(length<targetCapacity) {
2375  targetCapacity=length;
2376  }
2377  }
2378 
2379 #if MBCS_UNROLL_SINGLE_TO_BMP
2380  /* unrolling makes it faster on Pentium III/Windows 2000 */
2381  goto unrolled;
2382 #endif
2383  }
2384 
     /* targetCapacity was clamped to the source length, so test the real target limit for overflow */
2385  if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2386  /* target is full */
2387  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2388  }
2389 
2390  /* set offsets since the start or the last callback */
2391  if(offsets!=NULL) {
2392  size_t count=source-lastSource;
2393  while(count>0) {
2394  *offsets++=sourceIndex++;
2395  --count;
2396  }
2397  }
2398 
2399  /* write back the updated pointers */
2400  pArgs->source=(const char *)source;
2401  pArgs->target=target;
2402  pArgs->offsets=offsets;
2403 }
2404 
/*
 * Scan one row of the state table (stateTable[state]) and report whether it
 * contains an acceptable entry: first the entries for bytes 0xa1 and 0x41
 * are probed, then all 256 entries are scanned twice.
 * Returns TRUE as soon as one of the tests succeeds, FALSE otherwise.
 *
 * NOTE(review): the conditions of all four tests (listing lines 2411-2412,
 * 2417-2418, 2425-2426 and 2434-2435) are missing from this extract. Going
 * by the surviving comments, the first three look for final entries and the
 * last one recurses for transition entries -- confirm against the upstream
 * file.
 */
2405 static UBool
2406 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2407  const int32_t *row=stateTable[state];
2408  int32_t b, entry;
2409  /* First test for final entries in this state for some commonly valid byte values. */
2410  entry=row[0xa1];
2413  ) {
2414  return TRUE;
2415  }
2416  entry=row[0x41];
2419  ) {
2420  return TRUE;
2421  }
2422  /* Then test for final entries in this state. */
2423  for(b=0; b<=0xff; ++b) {
2424  entry=row[b];
2427  ) {
2428  return TRUE;
2429  }
2430  }
2431  /* Then recurse for transition entries. */
2432  for(b=0; b<=0xff; ++b) {
2433  entry=row[b];
2436  ) {
2437  return TRUE;
2438  }
2439  }
2440  return FALSE;
2441 }
2442 
2443 /*
2444  * Is byte b a single/lead byte in this state?
2445  * Recurse for transition states, because here we don't want to say that
2446  * b is a lead byte if all byte sequences that start with b are illegal.
2447  */
/*
 * For a final entry the answer is FALSE when the action is
 * MBCS_STATE_CHANGE_ONLY and isDBCSOnly is set, and otherwise TRUE unless
 * the action is MBCS_STATE_ILLEGAL.
 *
 * NOTE(review): listing lines 2453 (the result returned for a transition
 * entry) and 2455 (the assignment of `action`) are missing from this
 * extract; restore them from the upstream file before compiling.
 */
2448 static UBool
2449 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2450  const int32_t *row=stateTable[state];
2451  int32_t entry=row[b];
2452  if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
2454  } else {
2456  if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2457  return FALSE; /* SI/SO are illegal for DBCS-only conversion */
2458  } else {
2459  return action!=MBCS_STATE_ILLEGAL;
2460  }
2461  }
2462 }
2463 
/*
 * General multi-byte-to-Unicode conversion loop with optional offsets.
 *
 * Steps visible in this function:
 * - If cnv->preToULength>0, a partial extension match from an earlier buffer
 *   is continued first via ucnv_extContinueMatchToU().
 * - Converters whose shared data has countStates==1 are delegated to
 *   ucnv_MBCSSingleToBMPWithOffsets() or ucnv_MBCSSingleToUnicodeWithOffsets().
 * - Otherwise bytes are run through the state table. The state is loaded
 *   from cnv->mode and the partial byte sequence from cnv->toULength and
 *   cnv->toUBytes; state and byteIndex are stored back at the end.
 * - A fast inner loop handles 1- and 2-byte sequences that map to BMP code
 *   points; everything else goes through the action-code chain below it.
 * - Illegal sequences set U_ILLEGAL_CHAR_FOUND and are trimmed according to
 *   ticket 5691; unassigned sequences are passed to _extToU().
 * - A full target sets U_BUFFER_OVERFLOW_ERROR.
 * On exit pArgs->source, pArgs->target and pArgs->offsets are written back.
 *
 * NOTE(review): many listing lines are missing from this extract (gaps in
 * the embedded numbering): 2465, 2503, 2526, 2536, 2564-2566, 2571, 2585,
 * 2588, 2599-2601, 2606, 2625, 2628, 2662-2664, 2678-2680, 2690, 2704, 2710,
 * 2734, 2752, 2754, 2770, 2793-2794, 2796, 2842 and 2867. They held the
 * function name/first parameter, conditions, stores and the loading/saving
 * of `offset`. Restore them from the upstream file before compiling.
 */
2464 U_CFUNC void
2466  UErrorCode *pErrorCode) {
2467  UConverter *cnv;
2468  const uint8_t *source, *sourceLimit;
2469  UChar *target;
2470  const UChar *targetLimit;
2471  int32_t *offsets;
2472 
2473  const int32_t (*stateTable)[256];
2474  const uint16_t *unicodeCodeUnits;
2475 
2476  uint32_t offset;
2477  uint8_t state;
2478  int8_t byteIndex;
2479  uint8_t *bytes;
2480 
2481  int32_t sourceIndex, nextSourceIndex;
2482 
2483  int32_t entry;
2484  UChar c;
2485  uint8_t action;
2486 
2487  /* use optimized function if possible */
2488  cnv=pArgs->converter;
2489 
2490  if(cnv->preToULength>0) {
2491  /*
2492  * pass sourceIndex=-1 because we continue from an earlier buffer
2493  * in the future, this may change with continuous offsets
2494  */
2495  ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2496 
2497  if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2498  return;
2499  }
2500  }
2501 
2502  if(cnv->sharedData->mbcs.countStates==1) {
     /* NOTE(review): listing line 2503 (the condition choosing between the two single-byte variants) is missing here */
2504  ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2505  } else {
2506  ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2507  }
2508  return;
2509  }
2510 
2511  /* set up the local pointers */
2512  source=(const uint8_t *)pArgs->source;
2513  sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2514  target=pArgs->target;
2515  targetLimit=pArgs->targetLimit;
2516  offsets=pArgs->offsets;
2517 
2518  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2519  stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2520  } else {
2521  stateTable=cnv->sharedData->mbcs.stateTable;
2522  }
2523  unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2524 
2525  /* get the converter state from UConverter */
     /* NOTE(review): listing line 2526 is missing; presumably it loads `offset`, which is otherwise read before being set -- confirm */
2527  byteIndex=cnv->toULength;
2528  bytes=cnv->toUBytes;
2529 
2530  /*
2531  * if we are in the SBCS state for a DBCS-only converter,
2532  * then load the DBCS state from the MBCS data
2533  * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2534  */
2535  if((state=(uint8_t)(cnv->mode))==0) {
     /* NOTE(review): listing line 2536 (the assignment described by the comment above) is missing here */
2537  }
2538 
2539  /* sourceIndex=-1 if the current character began in the previous buffer */
2540  sourceIndex=byteIndex==0 ? 0 : -1;
2541  nextSourceIndex=0;
2542 
2543  /* conversion loop */
2544  while(source<sourceLimit) {
2545  /*
2546  * This following test is to see if available input would overflow the output.
2547  * It does not catch output of more than one code unit that
2548  * overflows as a result of a surrogate pair or callback output
2549  * from the last source byte.
2550  * Therefore, those situations also test for overflows and will
2551  * then break the loop, too.
2552  */
2553  if(target>=targetLimit) {
2554  /* target is full */
2555  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2556  break;
2557  }
2558 
2559  if(byteIndex==0) {
2560  /* optimized loop for 1/2-byte input and BMP output */
2561  if(offsets==NULL) {
2562  do {
2563  entry=stateTable[state][*source];
     /* NOTE(review): listing lines 2564-2566 (transition test and state/offset update) are missing here */
2567 
2568  ++source;
2569  if( source<sourceLimit &&
2570  MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2572  (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2573  ) {
2574  ++source;
2575  *target++=c;
2576  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2577  offset=0;
2578  } else {
2579  /* set the state and leave the optimized loop */
2580  bytes[0]=*(source-1);
2581  byteIndex=1;
2582  break;
2583  }
2584  } else {
2586  /* output BMP code point */
2587  ++source;
2589  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2590  } else {
2591  /* leave the optimized loop */
2592  break;
2593  }
2594  }
2595  } while(source<sourceLimit && target<targetLimit);
2596  } else /* offsets!=NULL */ {
2597  do {
2598  entry=stateTable[state][*source];
     /* NOTE(review): listing lines 2599-2601 (transition test and state/offset update) are missing here */
2602 
2603  ++source;
2604  if( source<sourceLimit &&
2605  MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2607  (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2608  ) {
2609  ++source;
2610  *target++=c;
2611  if(offsets!=NULL) {
2612  *offsets++=sourceIndex;
2613  sourceIndex=(nextSourceIndex+=2);
2614  }
2615  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2616  offset=0;
2617  } else {
2618  /* set the state and leave the optimized loop */
2619  ++nextSourceIndex;
2620  bytes[0]=*(source-1);
2621  byteIndex=1;
2622  break;
2623  }
2624  } else {
2626  /* output BMP code point */
2627  ++source;
2629  if(offsets!=NULL) {
2630  *offsets++=sourceIndex;
2631  sourceIndex=++nextSourceIndex;
2632  }
2633  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2634  } else {
2635  /* leave the optimized loop */
2636  break;
2637  }
2638  }
2639  } while(source<sourceLimit && target<targetLimit);
2640  }
2641 
2642  /*
2643  * these tests and break statements could be put inside the loop
2644  * if C had "break outerLoop" like Java
2645  */
2646  if(source>=sourceLimit) {
2647  break;
2648  }
2649  if(target>=targetLimit) {
2650  /* target is full */
2651  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2652  break;
2653  }
2654 
     /* record the byte that made the fast loop stop; its entry is already in `entry` */
2655  ++nextSourceIndex;
2656  bytes[byteIndex++]=*source++;
2657  } else /* byteIndex>0 */ {
2658  ++nextSourceIndex;
2659  entry=stateTable[state][bytes[byteIndex++]=*source++];
2660  }
2661 
     /* NOTE(review): listing lines 2662-2664 (transition test and state/offset update before this `continue`) are missing here */
2665  continue;
2666  }
2667 
2668  /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2669  cnv->mode=state;
2670 
2671  /* set the next state early so that we can reuse the entry variable */
2672  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2673 
2674  /*
2675  * An if-else-if chain provides more reliable performance for
2676  * the most common cases compared to a switch.
2677  */
     /* NOTE(review): listing lines 2678-2680 (assignment of `action`, first branch condition, offset update) are missing here */
2681  c=unicodeCodeUnits[offset];
2682  if(c<0xfffe) {
2683  /* output BMP code point */
2684  *target++=c;
2685  if(offsets!=NULL) {
2686  *offsets++=sourceIndex;
2687  }
2688  byteIndex=0;
2689  } else if(c==0xfffe) {
     /* NOTE(review): listing line 2690 (the fallback test that assigns `entry`) is missing here */
2691  /* output fallback BMP code point */
2692  *target++=(UChar)entry;
2693  if(offsets!=NULL) {
2694  *offsets++=sourceIndex;
2695  }
2696  byteIndex=0;
2697  }
2698  } else {
2699  /* callback(illegal) */
2700  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2701  }
2702  } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2703  /* output BMP code point */
2705  if(offsets!=NULL) {
2706  *offsets++=sourceIndex;
2707  }
2708  byteIndex=0;
2709  } else if(action==MBCS_STATE_VALID_16_PAIR) {
2711  c=unicodeCodeUnits[offset++];
2712  if(c<0xd800) {
2713  /* output BMP code point below 0xd800 */
2714  *target++=c;
2715  if(offsets!=NULL) {
2716  *offsets++=sourceIndex;
2717  }
2718  byteIndex=0;
2719  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2720  /* output roundtrip or fallback surrogate pair */
2721  *target++=(UChar)(c&0xdbff);
2722  if(offsets!=NULL) {
2723  *offsets++=sourceIndex;
2724  }
2725  byteIndex=0;
2726  if(target<targetLimit) {
2727  *target++=unicodeCodeUnits[offset];
2728  if(offsets!=NULL) {
2729  *offsets++=sourceIndex;
2730  }
2731  } else {
2732  /* target overflow */
     /* the second code unit does not fit: park it in the converter's UChar error buffer */
2733  cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2735  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2736 
2737  offset=0;
2738  break;
2739  }
2740  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2741  /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2742  *target++=unicodeCodeUnits[offset];
2743  if(offsets!=NULL) {
2744  *offsets++=sourceIndex;
2745  }
2746  byteIndex=0;
2747  } else if(c==0xffff) {
2748  /* callback(illegal) */
2749  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2750  }
2751  } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2753  ) {
2755  /* output surrogate pair */
2756  *target++=(UChar)(0xd800|(UChar)(entry>>10));
2757  if(offsets!=NULL) {
2758  *offsets++=sourceIndex;
2759  }
2760  byteIndex=0;
2761  c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2762  if(target<targetLimit) {
2763  *target++=c;
2764  if(offsets!=NULL) {
2765  *offsets++=sourceIndex;
2766  }
2767  } else {
2768  /* target overflow */
2769  cnv->UCharErrorBuffer[0]=c;
2771  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2772 
2773  offset=0;
2774  break;
2775  }
2776  } else if(action==MBCS_STATE_CHANGE_ONLY) {
2777  /*
2778  * This serves as a state change without any output.
2779  * It is useful for reading simple stateful encodings,
2780  * for example using just Shift-In/Shift-Out codes.
2781  * The 21 unused bits may later be used for more sophisticated
2782  * state transitions.
2783  */
2784  if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2785  byteIndex=0;
2786  } else {
2787  /* SI/SO are illegal for DBCS-only conversion */
2788  state=(uint8_t)(cnv->mode); /* restore the previous state */
2789 
2790  /* callback(illegal) */
2791  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2792  }
     /* NOTE(review): listing lines 2793-2794 (next `else if` branch and its nested condition) and 2796 (the store) are missing here */
2795  /* output BMP code point */
2797  if(offsets!=NULL) {
2798  *offsets++=sourceIndex;
2799  }
2800  byteIndex=0;
2801  }
2802  } else if(action==MBCS_STATE_UNASSIGNED) {
2803  /* just fall through */
2804  } else if(action==MBCS_STATE_ILLEGAL) {
2805  /* callback(illegal) */
2806  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2807  } else {
2808  /* reserved, must never occur */
2809  byteIndex=0;
2810  }
2811 
2812  /* end of action codes: prepare for a new character */
2813  offset=0;
2814 
     /* byteIndex==0 means the sequence was consumed; otherwise it is illegal or unassigned */
2815  if(byteIndex==0) {
2816  sourceIndex=nextSourceIndex;
2817  } else if(U_FAILURE(*pErrorCode)) {
2818  /* callback(illegal) */
2819  if(byteIndex>1) {
2820  /*
2821  * Ticket 5691: consistent illegal sequences:
2822  * - We include at least the first byte in the illegal sequence.
2823  * - If any of the non-initial bytes could be the start of a character,
2824  * we stop the illegal sequence before the first one of those.
2825  */
2826  UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2827  int8_t i;
2828  for(i=1;
2829  i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2830  ++i) {}
2831  if(i<byteIndex) {
2832  /* Back out some bytes. */
2833  int8_t backOutDistance=byteIndex-i;
2834  int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2835  byteIndex=i; /* length of reported illegal byte sequence */
2836  if(backOutDistance<=bytesFromThisBuffer) {
2837  source-=backOutDistance;
2838  } else {
2839  /* Back out bytes from the previous buffer: Need to replay them. */
2840  cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2841  /* preToULength is negative! */
     /* NOTE(review): listing line 2842 is missing; presumably it saves the bytes to be replayed -- confirm */
2843  source=(const uint8_t *)pArgs->source;
2844  }
2845  }
2846  }
2847  break;
2848  } else /* unassigned sequences indicated with byteIndex>0 */ {
2849  /* try an extension mapping */
     /* remember where source stood so the bytes consumed by _extToU() can be counted below */
2850  pArgs->source=(const char *)source;
2851  byteIndex=_extToU(cnv, cnv->sharedData,
2852  byteIndex, &source, sourceLimit,
2853  &target, targetLimit,
2854  &offsets, sourceIndex,
2855  pArgs->flush,
2856  pErrorCode);
2857  sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
2858 
2859  if(U_FAILURE(*pErrorCode)) {
2860  /* not mappable or buffer overflow */
2861  break;
2862  }
2863  }
2864  }
2865 
2866  /* set the converter state back into UConverter */
     /* NOTE(review): listing line 2867 is missing; presumably it stores `offset` back -- confirm */
2868  cnv->mode=state;
2869  cnv->toULength=byteIndex;
2870 
2871  /* write back the updated pointers */
2872  pArgs->source=(const char *)source;
2873  pArgs->target=target;
2874  pArgs->offsets=offsets;
2875 }
2876 
2877 /*
2878  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2879  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2880  */
/*
 * Each byte is looked up in row 0 of the state table (the swapLFNL state
 * table when UCNV_OPTION_SWAP_LFNL is set). pArgs->source is advanced before
 * the entry is examined so that the function can return directly.
 * An illegal byte sets U_ILLEGAL_CHAR_FOUND; for an unassigned byte
 * pArgs->source is moved back onto that byte so the generic code can handle
 * it. If the loop ends without a result, *pErrorCode is set to
 * U_INDEX_OUTOFBOUNDS_ERROR and 0xffff is returned.
 *
 * NOTE(review): listing lines 2882, 2909, 2911, 2918-2920, 2924-2925, 2927
 * and 2945 are missing from this extract (gaps in the embedded numbering).
 * They held the function name/first parameter, the branch conditions and
 * several return statements. Restore them from the upstream file before
 * compiling.
 */
2881 static UChar32
2883  UErrorCode *pErrorCode) {
2884  UConverter *cnv;
2885  const int32_t (*stateTable)[256];
2886  const uint8_t *source, *sourceLimit;
2887 
2888  int32_t entry;
2889  uint8_t action;
2890 
2891  /* set up the local pointers */
2892  cnv=pArgs->converter;
2893  source=(const uint8_t *)pArgs->source;
2894  sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2895  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2896  stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2897  } else {
2898  stateTable=cnv->sharedData->mbcs.stateTable;
2899  }
2900 
2901  /* conversion loop */
2902  while(source<sourceLimit) {
2903  entry=stateTable[0][*source++];
2904  /* MBCS_ENTRY_IS_FINAL(entry) */
2905 
2906  /* write back the updated pointer early so that we can return directly */
2907  pArgs->source=(const char *)source;
2908 
     /* NOTE(review): listing lines 2909 (the `if` condition) and 2911 (the return) are missing here */
2910  /* output BMP code point */
2912  }
2913 
2914  /*
2915  * An if-else-if chain provides more reliable performance for
2916  * the most common cases compared to a switch.
2917  */
     /* NOTE(review): listing lines 2918-2920 (assignment of `action` and the start of the first condition) are missing here */
2921  ) {
2922  /* output supplementary code point */
2923  return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
     /* NOTE(review): listing lines 2924-2925 (next `else if` branch and its nested condition) and 2927 (the return) are missing here */
2926  /* output BMP code point */
2928  }
2929  } else if(action==MBCS_STATE_UNASSIGNED) {
2930  /* just fall through */
2931  } else if(action==MBCS_STATE_ILLEGAL) {
2932  /* callback(illegal) */
2933  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2934  } else {
2935  /* reserved, must never occur */
2936  continue;
2937  }
2938 
2939  if(U_FAILURE(*pErrorCode)) {
2940  /* callback(illegal) */
2941  break;
2942  } else /* unassigned sequence */ {
2943  /* defer to the generic implementation */
     /* point source back at the unassigned byte so that it is converted again */
2944  pArgs->source=(const char *)source-1;
     /* NOTE(review): listing line 2945 (the return that hands over to the generic code) is missing here */
2946  }
2947  }
2948 
2949  /* no output because of empty input or only state changes */
2950  *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2951  return 0xffff;
2952 }
2953 
2954 /*
2955  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2956  * conversion without offset handling.
2957  *
2958  * When a character does not have a mapping to Unicode, then we return to the
2959  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2960  * handling.
2961  * We also defer to the generic code in other complicated cases and have them
2962  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2963  *
2964  * All normal mappings and errors are handled here.
2965  */
/*
 * Result handling visible in this function:
 * - On success the decoded code point c is returned and pArgs->source is
 *   advanced past the consumed bytes.
 * - If the input ends inside a byte sequence, the bytes are copied to the
 *   converter's byte buffer, cnv->toULength is set to their count and
 *   *pErrorCode becomes U_TRUNCATED_CHAR_FOUND.
 * - For an illegal sequence (*pErrorCode already a failure), the reported
 *   bytes are trimmed according to ticket 5691 and source is moved back to
 *   the first byte that is not part of the reported sequence.
 * - With no output at all, *pErrorCode becomes U_INDEX_OUTOFBOUNDS_ERROR.
 * In the three error cases 0xffff is returned.
 * Before returning, cnv->toUnicodeStatus is reset to 0 and cnv->mode is set
 * to the current state.
 *
 * NOTE(review): listing lines 2967, 2987, 2990, 2996, 3013, 3021, 3028-3030,
 * 3035, 3054-3055, 3057, 3060, 3066, 3074, 3092, 3112-3113, 3115, 3141, 3149
 * and 3164 are missing from this extract (gaps in the embedded numbering).
 * They held the function name/first parameter, the returns that defer to the
 * generic code, branch conditions, assignments of c and the declarations of
 * `bytes`. Restore them from the upstream file before compiling.
 */
2966 static UChar32 U_CALLCONV
2968  UErrorCode *pErrorCode) {
2969  UConverter *cnv;
2970  const uint8_t *source, *sourceLimit, *lastSource;
2971 
2972  const int32_t (*stateTable)[256];
2973  const uint16_t *unicodeCodeUnits;
2974 
2975  uint32_t offset;
2976  uint8_t state;
2977 
2978  int32_t entry;
2979  UChar32 c;
2980  uint8_t action;
2981 
2982  /* use optimized function if possible */
2983  cnv=pArgs->converter;
2984 
2985  if(cnv->preToULength>0) {
2986  /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2988  }
2989 
     /* NOTE(review): listing lines 2990 (the `if` condition) and 2996 (its return) are missing around the comment below */
2991  /*
2992  * Using the generic ucnv_getNextUChar() code lets us deal correctly
2993  * with the rare case of a codepage that maps single surrogates
2994  * without adding the complexity to this already complicated function here.
2995  */
2997  } else if(cnv->sharedData->mbcs.countStates==1) {
2998  return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2999  }
3000 
3001  /* set up the local pointers */
3002  source=lastSource=(const uint8_t *)pArgs->source;
3003  sourceLimit=(const uint8_t *)pArgs->sourceLimit;
3004 
3005  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3006  stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
3007  } else {
3008  stateTable=cnv->sharedData->mbcs.stateTable;
3009  }
3010  unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
3011 
3012  /* get the converter state from UConverter */
     /* NOTE(review): listing line 3013 is missing; presumably it loads `offset` -- confirm */
3014 
3015  /*
3016  * if we are in the SBCS state for a DBCS-only converter,
3017  * then load the DBCS state from the MBCS data
3018  * (dbcsOnlyState==0 if it is not a DBCS-only converter)
3019  */
3020  if((state=(uint8_t)(cnv->mode))==0) {
3022  }
3023 
3024  /* conversion loop */
     /* c stays negative (U_SENTINEL) until a code point has been decoded */
3025  c=U_SENTINEL;
3026  while(source<sourceLimit) {
3027  entry=stateTable[state][*source++];
     /* NOTE(review): listing lines 3028-3030 (transition test and state/offset update) are missing here */
3031 
3032  /* optimization for 1/2-byte input and BMP output */
3033  if( source<sourceLimit &&
3034  MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
3036  (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
3037  ) {
3038  ++source;
3039  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
3040  /* output BMP code point */
3041  break;
3042  }
3043  } else {
3044  /* save the previous state for proper extension mapping with SI/SO-stateful converters */
3045  cnv->mode=state;
3046 
3047  /* set the next state early so that we can reuse the entry variable */
3048  state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
3049 
3050  /*
3051  * An if-else-if chain provides more reliable performance for
3052  * the most common cases compared to a switch.
3053  */
     /* NOTE(review): listing lines 3054-3055 (assignment of `action` and the first branch condition) and 3057 (assignment of c) are missing here */
3056  /* output BMP code point */
3058  break;
3059  } else if(action==MBCS_STATE_VALID_16) {
3061  c=unicodeCodeUnits[offset];
3062  if(c<0xfffe) {
3063  /* output BMP code point */
3064  break;
3065  } else if(c==0xfffe) {
     /* NOTE(review): listing line 3066 (the fallback test guarding this break) is missing here */
3067  break;
3068  }
3069  } else {
3070  /* callback(illegal) */
3071  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3072  }
3073  } else if(action==MBCS_STATE_VALID_16_PAIR) {
3075  c=unicodeCodeUnits[offset++];
3076  if(c<0xd800) {
3077  /* output BMP code point below 0xd800 */
3078  break;
3079  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
3080  /* output roundtrip or fallback supplementary code point */
     /* combine the lead unit in c with the next stored unit into one supplementary code point */
3081  c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
3082  break;
3083  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
3084  /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3085  c=unicodeCodeUnits[offset];
3086  break;
3087  } else if(c==0xffff) {
3088  /* callback(illegal) */
3089  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3090  }
3091  } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
3093  ) {
3094  /* output supplementary code point */
3095  c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
3096  break;
3097  } else if(action==MBCS_STATE_CHANGE_ONLY) {
3098  /*
3099  * This serves as a state change without any output.
3100  * It is useful for reading simple stateful encodings,
3101  * for example using just Shift-In/Shift-Out codes.
3102  * The 21 unused bits may later be used for more sophisticated
3103  * state transitions.
3104  */
3105  if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
3106  /* SI/SO are illegal for DBCS-only conversion */
3107  state=(uint8_t)(cnv->mode); /* restore the previous state */
3108 
3109  /* callback(illegal) */
3110  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3111  }
     /* NOTE(review): listing lines 3112-3113 (next `else if` branch and its nested condition) and 3115 (assignment of c) are missing here */
3114  /* output BMP code point */
3116  break;
3117  }
3118  } else if(action==MBCS_STATE_UNASSIGNED) {
3119  /* just fall through */
3120  } else if(action==MBCS_STATE_ILLEGAL) {
3121  /* callback(illegal) */
3122  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3123  } else {
3124  /* reserved (must never occur), or only state change */
3125  offset=0;
3126  lastSource=source;
3127  continue;
3128  }
3129 
3130  /* end of action codes: prepare for a new character */
3131  offset=0;
3132 
3133  if(U_FAILURE(*pErrorCode)) {
3134  /* callback(illegal) */
3135  break;
3136  } else /* unassigned sequence */ {
3137  /* defer to the generic implementation */
3138  cnv->toUnicodeStatus=0;
3139  cnv->mode=state;
     /* rewind to the start of this sequence so the generic code sees all of its bytes */
3140  pArgs->source=(const char *)lastSource;
     /* NOTE(review): listing line 3141 (the return that hands over to the generic code) is missing here */
3142  }
3143  }
3144  }
3145 
     /* c<0: the loop ended without decoding a code point */
3146  if(c<0) {
3147  if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
3148  /* incomplete character byte sequence */
     /* NOTE(review): listing line 3149 (declaration of `bytes`) is missing here */
3150  cnv->toULength=(int8_t)(source-lastSource);
3151  do {
3152  *bytes++=*lastSource++;
3153  } while(lastSource<source);
3154  *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3155  } else if(U_FAILURE(*pErrorCode)) {
3156  /* callback(illegal) */
3157  /*
3158  * Ticket 5691: consistent illegal sequences:
3159  * - We include at least the first byte in the illegal sequence.
3160  * - If any of the non-initial bytes could be the start of a character,
3161  * we stop the illegal sequence before the first one of those.
3162  */
3163  UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
     /* NOTE(review): listing line 3164 (declaration of `bytes`) is missing here */
3165  *bytes++=*lastSource++; /* first byte */
3166  if(lastSource==source) {
3167  cnv->toULength=1;
3168  } else /* lastSource<source: multi-byte character */ {
3169  int8_t i;
3170  for(i=1;
3171  lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
3172  ++i
3173  ) {
3174  *bytes++=*lastSource++;
3175  }
3176  cnv->toULength=i;
     /* resume after the reported bytes; the remaining ones are read again */
3177  source=lastSource;
3178  }
3179  } else {
3180  /* no output because of empty input or only state changes */
3181  *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
3182  }
3183  c=0xffff;
3184  }
3185 
3186  /* set the converter state back into UConverter, ready for a new character */
3187  cnv->toUnicodeStatus=0;
3188  cnv->mode=state;
3189 
3190  /* write back the updated pointer */
3191  pArgs->source=(const char *)source;
3192  return c;
3193 }
3194 
3195 #if 0
3196 /*
3197  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3198  * Removal improves code coverage.
3199  */
3200 /**
3201  * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
3202  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3203  * It does not handle conversion extensions (_extToU()).
      *
      * The byte b is looked up in row 0 of the state table and the entry is
      * handled as a final entry.
      *
      * Return values visible in the body:
      * 0xfffe for an unassigned byte, and for a fallback mapping when
      * fallbacks are not used;
      * 0xffff for an illegal or reserved action code;
      * otherwise the mapped code point (0x10000 plus the entry's final value
      * for supplementary code points).
      *
      * NOTE(review): this listing skips numbered lines 3205-3206, 3214, 3216,
      * 3223-3224, 3227 and 3232-3233, so the start of the signature and several
      * branch conditions/returns are not visible here. Restore them from the
      * upstream ICU source before re-enabling this code.
3204  */
3207  uint8_t b, UBool useFallback) {
3208  int32_t entry;
3209  uint8_t action;
3210 
3211  entry=sharedData->mbcs.stateTable[0][b];
3212  /* MBCS_ENTRY_IS_FINAL(entry) */
3213 
3215  /* output BMP code point */
3217  }
3218 
3219  /*
3220  * An if-else-if chain provides more reliable performance for
3221  * the most common cases compared to a switch.
3222  */
3225  /* output supplementary code point */
3226  return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
      /* fallback mapping: only taken when fallbacks are enabled, else "unassigned" */
3228  if(!TO_U_USE_FALLBACK(useFallback)) {
3229  return 0xfffe;
3230  }
3231  /* output BMP code point */
      /* fallback mapping: only taken when fallbacks are enabled, else "unassigned" */
3234  if(!TO_U_USE_FALLBACK(useFallback)) {
3235  return 0xfffe;
3236  }
3237  /* output supplementary code point */
3238  return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3239  } else if(action==MBCS_STATE_UNASSIGNED) {
3240  return 0xfffe;
3241  } else if(action==MBCS_STATE_ILLEGAL) {
3242  return 0xffff;
3243  } else {
3244  /* reserved, must never occur */
3245  return 0xffff;
3246  }
3247 }
3248 #endif
3249 
3250 /*
3251  * This is a simple version of _MBCSGetNextUChar() that is used
3252  * by other converter implementations.
3253  * It only returns an "assigned" result if it consumes the entire input.
3254  * It does not use state from the converter, nor error codes.
3255  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3256  * It handles conversion extensions but not GB 18030.
3257  *
3258  * Return value:
3259  * U+fffe unassigned
3260  * U+ffff illegal
3261  * otherwise the Unicode code point
      *
      * Details visible in the body:
      * - length<=0 returns U+ffff without reading source.
      * - The state machine starts in sharedData->mbcs.dbcsOnlyState with
      *   offset 0 and reads one byte of source per loop iteration.
      * - Input that ends before a final entry is reached (truncated), or that
      *   still has bytes left after a final entry, returns U+ffff.
      * - State-change-only, illegal and reserved action codes return U+ffff.
      * - When the table result is U+fffe and sharedData->mbcs.extIndexes is
      *   not NULL, the result of ucnv_extSimpleMatchToU() is returned instead.
      *
      * NOTE(review): this listing skips numbered lines 3263-3264, 3309-3311,
      * 3321-3323, 3334, 3337, 3357, 3363 and 3365, so the start of the signature,
      * the transition handling and several action tests/assignments are not
      * visible here.
      *
      * NOTE(review): UCNV_TO_U_USE_FALLBACK(cnv) is used below, but no `cnv`
      * appears among the visible parameters or locals, while the other branches
      * use TO_U_USE_FALLBACK(useFallback). Presumably the macro ignores its
      * argument; confirm against ucnv_cnv.h and consider using useFallback
      * consistently.
3262  */
3265  const char *source, int32_t length,
3266  UBool useFallback) {
3267  const int32_t (*stateTable)[256];
3268  const uint16_t *unicodeCodeUnits;
3269 
3270  uint32_t offset;
3271  uint8_t state, action;
3272 
3273  UChar32 c;
3274  int32_t i, entry;
3275 
3276  if(length<=0) {
3277  /* no input at all: "illegal" */
3278  return 0xffff;
3279  }
3280 
3281 #if 0
3282 /*
3283  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3284  * TODO In future releases, verify that this function is never called for SBCS
3285  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3286  * Removal improves code coverage.
3287  */
3288  /* use optimized function if possible */
3289  if(sharedData->mbcs.countStates==1) {
3290  if(length==1) {
3291  return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3292  } else {
3293  return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3294  }
3295  }
3296 #endif
3297 
3298  /* set up the local pointers */
3299  stateTable=sharedData->mbcs.stateTable;
3300  unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3301 
3302  /* converter state */
3303  offset=0;
3304  state=sharedData->mbcs.dbcsOnlyState;
3305 
3306  /* conversion loop */
      /* no loop condition: every path below either breaks, returns, or reads the next byte */
3307  for(i=0;;) {
3308  entry=stateTable[state][(uint8_t)source[i++]];
3312 
      /* all input consumed while still inside a multi-byte sequence */
3313  if(i==length) {
3314  return 0xffff; /* truncated character */
3315  }
3316  } else {
3317  /*
3318  * An if-else-if chain provides more reliable performance for
3319  * the most common cases compared to a switch.
3320  */
3324  c=unicodeCodeUnits[offset];
3325  if(c!=0xfffe) {
3326  /* done */
3327  } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3328  c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3329  /* else done with 0xfffe */
3330  }
3331  break;
3332  } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3333  /* output BMP code point */
3335  break;
3336  } else if(action==MBCS_STATE_VALID_16_PAIR) {
      /* the first code unit selects how the pair is interpreted */
3338  c=unicodeCodeUnits[offset++];
3339  if(c<0xd800) {
3340  /* output BMP code point below 0xd800 */
3341  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
3342  /* output roundtrip or fallback supplementary code point */
3343  c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3344  } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
3345  /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3346  c=unicodeCodeUnits[offset];
3347  } else if(c==0xffff) {
3348  return 0xffff;
3349  } else {
3350  c=0xfffe;
3351  }
3352  break;
3353  } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3354  /* output supplementary code point */
3355  c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3356  break;
      /* fallback mapping: reported as unassigned when fallbacks are not used */
3358  if(!TO_U_USE_FALLBACK(useFallback)) {
3359  c=0xfffe;
3360  break;
3361  }
3362  /* output BMP code point */
3364  break;
      /* fallback mapping: reported as unassigned when fallbacks are not used */
3366  if(!TO_U_USE_FALLBACK(useFallback)) {
3367  c=0xfffe;
3368  break;
3369  }
3370  /* output supplementary code point */
3371  c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3372  break;
3373  } else if(action==MBCS_STATE_UNASSIGNED) {
3374  c=0xfffe;
3375  break;
3376  }
3377 
3378  /*
3379  * forbid MBCS_STATE_CHANGE_ONLY for this function,
3380  * and MBCS_STATE_ILLEGAL and reserved action codes
3381  */
3382  return 0xffff;
3383  }
3384  }
3385 
3386  if(i!=length) {
3387  /* illegal for this function: not all input consumed */
3388  return 0xffff;
3389  }
3390 
3391  if(c==0xfffe) {
3392  /* try an extension mapping */
3393  const int32_t *cx=sharedData->mbcs.extIndexes;
3394  if(cx!=NULL) {
3395  return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3396  }
3397  }
3398 
3399  return c;
3400 }
3401 
3402 /* MBCS-from-Unicode conversion functions ----------------------------------- */
3403 
3404 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
      /*
       * Converts UTF-16 input from pArgs->source into one or two output bytes
       * per code point, written to pArgs->target.
       *
       * Flow visible in the body:
       * - A pending code point from cnv->fromUChar32 is resumed at getTrail
       *   when there is target space.
       * - Code units <=0x7f that pass IS_ASCII_ROUNDTRIP are copied directly.
       * - Code units <=0xd7ff are looked up with DBCS_RESULT_FROM_MOST_BMP;
       *   everything else goes through surrogate pairing and the stage-2 lookup.
       * - Unassigned results are handed to an extension lookup; a failure code
       *   from it ends the loop.
       * - If the target has room for only the first of two bytes, the second
       *   byte is stored in cnv->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR
       *   is set.
       * - On exit cnv->fromUChar32 and pArgs->source/target/offsets are
       *   written back.
       *
       * Errors reported through pErrorCode in this body: U_ILLEGAL_CHAR_FOUND
       * for unmatched surrogates, U_BUFFER_OVERFLOW_ERROR for a full target.
       *
       * NOTE(review): this listing skips numbered lines 3406, 3438, 3441, 3443,
       * 3559 and 3606, so the function-name line, the assignments to `table`
       * and `bytes`, the head of the extension call and one statement of the
       * overflow path are not visible here.
       */
3405 static void
3407  UErrorCode *pErrorCode) {
3408  UConverter *cnv;
3409  const UChar *source, *sourceLimit;
3410  uint8_t *target;
3411  int32_t targetCapacity;
3412  int32_t *offsets;
3413 
3414  const uint16_t *table;
3415  const uint16_t *mbcsIndex;
3416  const uint8_t *bytes;
3417 
3418  UChar32 c;
3419 
3420  int32_t sourceIndex, nextSourceIndex;
3421 
3422  uint32_t stage2Entry;
3423  uint32_t asciiRoundtrips;
3424  uint32_t value;
3425  uint8_t unicodeMask;
3426 
3427  /* use optimized function if possible */
3428  cnv=pArgs->converter;
3429  unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3430 
3431  /* set up the local pointers */
3432  source=pArgs->source;
3433  sourceLimit=pArgs->sourceLimit;
3434  target=(uint8_t *)pArgs->target;
3435  targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3436  offsets=pArgs->offsets;
3437 
3439  mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
      /* the two (not visible) branch bodies presumably select the `bytes` table - TODO confirm */
3440  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3442  } else {
3444  }
3445  asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3446 
3447  /* get the converter state from UConverter */
3448  c=cnv->fromUChar32;
3449 
3450  /* sourceIndex=-1 if the current character began in the previous buffer */
3451  sourceIndex= c==0 ? 0 : -1;
3452  nextSourceIndex=0;
3453 
3454  /* conversion loop */
      /* a code unit saved by the previous call is pending: continue at its trail-surrogate check */
3455  if(c!=0 && targetCapacity>0) {
3456  goto getTrail;
3457  }
3458 
3459  while(source<sourceLimit) {
3460  /*
3461  * This following test is to see if available input would overflow the output.
3462  * It does not catch output of more than one byte that
3463  * overflows as a result of a multi-byte character or callback output
3464  * from the last source character.
3465  * Therefore, those situations also test for overflows and will
3466  * then break the loop, too.
3467  */
3468  if(targetCapacity>0) {
3469  /*
3470  * Get a correct Unicode code point:
3471  * a single UChar for a BMP code point or
3472  * a matched surrogate pair for a "supplementary code point".
3473  */
3474  c=*source++;
3475  ++nextSourceIndex;
      /* fast path: ASCII code units that roundtrip are copied as a single byte */
3476  if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3477  *target++=(uint8_t)c;
3478  if(offsets!=NULL) {
3479  *offsets++=sourceIndex;
3480  sourceIndex=nextSourceIndex;
3481  }
3482  --targetCapacity;
3483  c=0;
3484  continue;
3485  }
3486  /*
3487  * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3488  * to avoid dealing with surrogates.
3489  * MBCS_FAST_MAX must be >=0xd7ff.
3490  */
3491  if(c<=0xd7ff) {
3492  value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3493  /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3494  if(value==0) {
3495  goto unassigned;
3496  }
3497  /* output the value */
3498  } else {
3499  /*
3500  * This also tests if the codepage maps single surrogates.
3501  * If it does, then surrogates are not paired but mapped separately.
3502  * Note that in this case unmatched surrogates are not detected.
3503  */
3504  if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3505  if(U16_IS_SURROGATE_LEAD(c)) {
3506 getTrail:
3507  if(source<sourceLimit) {
3508  /* test the following code unit */
3509  UChar trail=*source;
3510  if(U16_IS_TRAIL(trail)) {
3511  ++source;
3512  ++nextSourceIndex;
3513  c=U16_GET_SUPPLEMENTARY(c, trail);
3514  if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3515  /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3516  /* callback(unassigned) */
3517  goto unassigned;
3518  }
3519  /* convert this supplementary code point */
3520  /* exit this condition tree */
3521  } else {
3522  /* this is an unmatched lead code unit (1st surrogate) */
3523  /* callback(illegal) */
3524  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3525  break;
3526  }
3527  } else {
3528  /* no more input */
      /* c keeps the lead surrogate; it is saved into cnv->fromUChar32 after the loop */
3529  break;
3530  }
3531  } else {
3532  /* this is an unmatched trail code unit (2nd surrogate) */
3533  /* callback(illegal) */
3534  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3535  break;
3536  }
3537  }
3538 
3539  /* convert the Unicode code point in c into codepage bytes */
3540  stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3541 
3542  /* get the bytes and the length for the output */
3543  /* MBCS_OUTPUT_2 */
3544  value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3545 
3546  /* is this code point assigned, or do we use fallbacks? */
3547  if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3548  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3549  ) {
3550  /*
3551  * We allow a 0 byte output if the "assigned" bit is set for this entry.
3552  * There is no way with this data structure for fallback output
3553  * to be a zero byte.
3554  */
3555 
3556 unassigned:
3557  /* try an extension mapping */
      /* pArgs->source marks where the extension lookup started, to measure what it consumed */
3558  pArgs->source=source;
3560  c, &source, sourceLimit,
3561  &target, target+targetCapacity,
3562  &offsets, sourceIndex,
3563  pArgs->flush,
3564  pErrorCode);
3565  nextSourceIndex+=(int32_t)(source-pArgs->source);
3566 
3567  if(U_FAILURE(*pErrorCode)) {
3568  /* not mappable or buffer overflow */
3569  break;
3570  } else {
3571  /* a mapping was written to the target, continue */
3572 
3573  /* recalculate the targetCapacity after an extension mapping */
3574  targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3575 
3576  /* normal end of conversion: prepare for a new character */
3577  sourceIndex=nextSourceIndex;
3578  continue;
3579  }
3580  }
3581  }
3582 
3583  /* write the output character bytes from value and length */
3584  /* from the first if in the loop we know that targetCapacity>0 */
3585  if(value<=0xff) {
3586  /* this is easy because we know that there is enough space */
3587  *target++=(uint8_t)value;
3588  if(offsets!=NULL) {
3589  *offsets++=sourceIndex;
3590  }
3591  --targetCapacity;
3592  } else /* length==2 */ {
      /* the first byte always fits; room for the second byte is checked next */
3593  *target++=(uint8_t)(value>>8);
3594  if(2<=targetCapacity) {
3595  *target++=(uint8_t)value;
3596  if(offsets!=NULL) {
3597  *offsets++=sourceIndex;
3598  *offsets++=sourceIndex;
3599  }
3600  targetCapacity-=2;
3601  } else {
3602  if(offsets!=NULL) {
3603  *offsets++=sourceIndex;
3604  }
      /* no room for the second byte: keep it in the converter's error buffer */
3605  cnv->charErrorBuffer[0]=(char)value;
3607 
3608  /* target overflow */
3609  targetCapacity=0;
3610  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3611  c=0;
3612  break;
3613  }
3614  }
3615 
3616  /* normal end of conversion: prepare for a new character */
3617  c=0;
3618  sourceIndex=nextSourceIndex;
3619  continue;
3620  } else {
3621  /* target is full */
3622  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3623  break;
3624  }
3625  }
3626 
3627  /* set the converter state back into UConverter */
3628  cnv->fromUChar32=c;
3629 
3630  /* write back the updated pointers */
3631  pArgs->source=source;
3632  pArgs->target=(char *)target;
3633  pArgs->offsets=offsets;
3634 }
3635 
3636 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
      /*
       * Converts UTF-16 input from pArgs->source into single output bytes
       * written to pArgs->target.
       *
       * Flow visible in the body:
       * - A pending code point from cnv->fromUChar32 is resumed at getTrail
       *   when there is target space.
       * - Surrogates are paired first; a supplementary code point goes straight
       *   to the unassigned path when hasSupplementary is false.
       * - A result is written when value>=minValue, where minValue is 0x800
       *   with cnv->useFallback set and 0xc00 otherwise.
       * - Everything else is handed to an extension lookup; a failure code from
       *   it ends the loop.
       * - On exit cnv->fromUChar32 and pArgs->source/target/offsets are
       *   written back.
       *
       * Errors reported through pErrorCode in this body: U_ILLEGAL_CHAR_FOUND
       * for unmatched surrogates, U_BUFFER_OVERFLOW_ERROR for a full target.
       *
       * NOTE(review): this listing skips numbered lines 3638, 3664, 3666, 3668,
       * 3678, 3745 and 3765, so the function-name line, the assignments to
       * `table`, `results`, `hasSupplementary` and `value`, and the head of the
       * extension call are not visible here.
       */
3637 static void
3639  UErrorCode *pErrorCode) {
3640  UConverter *cnv;
3641  const UChar *source, *sourceLimit;
3642  uint8_t *target;
3643  int32_t targetCapacity;
3644  int32_t *offsets;
3645 
3646  const uint16_t *table;
3647  const uint16_t *results;
3648 
3649  UChar32 c;
3650 
3651  int32_t sourceIndex, nextSourceIndex;
3652 
3653  uint16_t value, minValue;
3654  UBool hasSupplementary;
3655 
3656  /* set up the local pointers */
3657  cnv=pArgs->converter;
3658  source=pArgs->source;
3659  sourceLimit=pArgs->sourceLimit;
3660  target=(uint8_t *)pArgs->target;
3661  targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3662  offsets=pArgs->offsets;
3663 
      /* the two (not visible) branch bodies presumably select the `results` table - TODO confirm */
3665  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3667  } else {
3669  }
3670 
3671  if(cnv->useFallback) {
3672  /* use all roundtrip and fallback results */
3673  minValue=0x800;
3674  } else {
3675  /* use only roundtrips and fallbacks from private-use characters */
3676  minValue=0xc00;
3677  }
3679 
3680  /* get the converter state from UConverter */
3681  c=cnv->fromUChar32;
3682 
3683  /* sourceIndex=-1 if the current character began in the previous buffer */
3684  sourceIndex= c==0 ? 0 : -1;
3685  nextSourceIndex=0;
3686 
3687  /* conversion loop */
      /* a code unit saved by the previous call is pending: continue at its trail-surrogate check */
3688  if(c!=0 && targetCapacity>0) {
3689  goto getTrail;
3690  }
3691 
3692  while(source<sourceLimit) {
3693  /*
3694  * This following test is to see if available input would overflow the output.
3695  * It does not catch output of more than one byte that
3696  * overflows as a result of a multi-byte character or callback output
3697  * from the last source character.
3698  * Therefore, those situations also test for overflows and will
3699  * then break the loop, too.
3700  */
3701  if(targetCapacity>0) {
3702  /*
3703  * Get a correct Unicode code point:
3704  * a single UChar for a BMP code point or
3705  * a matched surrogate pair for a "supplementary code point".
3706  */
3707  c=*source++;
3708  ++nextSourceIndex;
3709  if(U16_IS_SURROGATE(c)) {
3710  if(U16_IS_SURROGATE_LEAD(c)) {
3711 getTrail:
3712  if(source<sourceLimit) {
3713  /* test the following code unit */
3714  UChar trail=*source;
3715  if(U16_IS_TRAIL(trail)) {
3716  ++source;
3717  ++nextSourceIndex;
3718  c=U16_GET_SUPPLEMENTARY(c, trail);
3719  if(!hasSupplementary) {
3720  /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3721  /* callback(unassigned) */
3722  goto unassigned;
3723  }
3724  /* convert this supplementary code point */
3725  /* exit this condition tree */
3726  } else {
3727  /* this is an unmatched lead code unit (1st surrogate) */
3728  /* callback(illegal) */
3729  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3730  break;
3731  }
3732  } else {
3733  /* no more input */
      /* c keeps the lead surrogate; it is saved into cnv->fromUChar32 after the loop */
3734  break;
3735  }
3736  } else {
3737  /* this is an unmatched trail code unit (2nd surrogate) */
3738  /* callback(illegal) */
3739  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3740  break;
3741  }
3742  }
3743 
3744  /* convert the Unicode code point in c into codepage bytes */
3746 
3747  /* is this code point assigned, or do we use fallbacks? */
3748  if(value>=minValue) {
3749  /* assigned, write the output character bytes from value and length */
3750  /* length==1 */
3751  /* this is easy because we know that there is enough space */
3752  *target++=(uint8_t)value;
3753  if(offsets!=NULL) {
3754  *offsets++=sourceIndex;
3755  }
3756  --targetCapacity;
3757 
3758  /* normal end of conversion: prepare for a new character */
3759  c=0;
3760  sourceIndex=nextSourceIndex;
3761  } else { /* unassigned */
3762 unassigned:
3763  /* try an extension mapping */
      /* pArgs->source marks where the extension lookup started, to measure what it consumed */
3764  pArgs->source=source;
3766  c, &source, sourceLimit,
3767  &target, target+targetCapacity,
3768  &offsets, sourceIndex,
3769  pArgs->flush,
3770  pErrorCode);
3771  nextSourceIndex+=(int32_t)(source-pArgs->source);
3772 
3773  if(U_FAILURE(*pErrorCode)) {
3774  /* not mappable or buffer overflow */
3775  break;
3776  } else {
3777  /* a mapping was written to the target, continue */
3778 
3779  /* recalculate the targetCapacity after an extension mapping */
3780  targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3781 
3782  /* normal end of conversion: prepare for a new character */
3783  sourceIndex=nextSourceIndex;
3784  }
3785  }
3786  } else {
3787  /* target is full */
3788  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3789  break;
3790  }
3791  }
3792 
3793  /* set the converter state back into UConverter */
3794  cnv->fromUChar32=c;
3795 
3796  /* write back the updated pointers */
3797  pArgs->source=source;
3798  pArgs->target=(char *)target;
3799  pArgs->offsets=offsets;
3800 }
3801 
3802 /*
3803  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3804  * that map only to and from the BMP.
3805  * In addition to single-byte/state optimizations, the offset calculations
3806  * become much easier.
3807  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3808  * but measurements have shown that this diminishes performance
3809  * in more cases than it improves it.
3810  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3811  * for various MBCS and SBCS optimizations.
      *
      * Flow visible in the body:
      * - targetCapacity is reduced to the smaller of the source length and the
      *   target capacity, and then serves as the only loop counter.
      * - Offsets are not written per character; they are filled in bulk from
      *   lastSource before each extension lookup and once more after the loop.
      * - Surrogates are only examined after a lookup result is below minValue.
      * - A lead surrogate at the end of the input sets U_TRUNCATED_CHAR_FOUND
      *   only when pArgs->flush is set.
      * - U_BUFFER_OVERFLOW_ERROR is set after the loop when input remains and
      *   the target has reached pArgs->targetLimit.
      *
      * NOTE(review): this listing skips numbered lines 3814, 3840, 3842, 3844,
      * 3888, 3945 and 4010, so the function-name line, the assignments to
      * `table` and `results`, the first lookup of the unrolled loop, the
      * `value` lookup of the main loop and the head of the extension call are
      * not visible here.
3812  */
3813 static void
3815  UErrorCode *pErrorCode) {
3816  UConverter *cnv;
3817  const UChar *source, *sourceLimit, *lastSource;
3818  uint8_t *target;
3819  int32_t targetCapacity, length;
3820  int32_t *offsets;
3821 
3822  const uint16_t *table;
3823  const uint16_t *results;
3824 
3825  UChar32 c;
3826 
3827  int32_t sourceIndex;
3828 
3829  uint32_t asciiRoundtrips;
3830  uint16_t value, minValue;
3831 
3832  /* set up the local pointers */
3833  cnv=pArgs->converter;
3834  source=pArgs->source;
3835  sourceLimit=pArgs->sourceLimit;
3836  target=(uint8_t *)pArgs->target;
3837  targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3838  offsets=pArgs->offsets;
3839 
      /* the two (not visible) branch bodies presumably select the `results` table - TODO confirm */
3841  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3843  } else {
3845  }
3846  asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3847 
3848  if(cnv->useFallback) {
3849  /* use all roundtrip and fallback results */
3850  minValue=0x800;
3851  } else {
3852  /* use only roundtrips and fallbacks from private-use characters */
3853  minValue=0xc00;
3854  }
3855 
3856  /* get the converter state from UConverter */
3857  c=cnv->fromUChar32;
3858 
3859  /* sourceIndex=-1 if the current character began in the previous buffer */
3860  sourceIndex= c==0 ? 0 : -1;
3861  lastSource=source;
3862 
3863  /*
3864  * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3865  * for the minimum of the sourceLength and targetCapacity
3866  */
3867  length=(int32_t)(sourceLimit-source);
3868  if(length<targetCapacity) {
3869  targetCapacity=length;
3870  }
3871 
3872  /* conversion loop */
      /* a code unit saved by the previous call is pending: continue at its trail-surrogate check */
3873  if(c!=0 && targetCapacity>0) {
3874  goto getTrail;
3875  }
3876 
3877 #if MBCS_UNROLL_SINGLE_FROM_BMP
3878  /* unrolling makes it slower on Pentium III/Windows 2000?! */
3879  /* unroll the loop with the most common case */
3880 unrolled:
3881  if(targetCapacity>=4) {
3882  int32_t count, loops;
3883  uint16_t andedValues;
3884 
3885  loops=count=targetCapacity>>2;
3886  do {
      /* bytes are written unconditionally; the group of 4 is validated afterwards */
3887  c=*source++;
3889  *target++=(uint8_t)value;
3890  c=*source++;
3891  andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3892  *target++=(uint8_t)value;
3893  c=*source++;
3894  andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3895  *target++=(uint8_t)value;
3896  c=*source++;
3897  andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3898  *target++=(uint8_t)value;
3899 
3900  /* were all 4 entries really valid? */
3901  if(andedValues<minValue) {
3902  /* no, return to the first of these 4 */
3903  source-=4;
3904  target-=4;
3905  break;
3906  }
3907  } while(--count>0);
      /* count becomes the number of complete groups of 4 that were accepted */
3908  count=loops-count;
3909  targetCapacity-=4*count;
3910 
3911  if(offsets!=NULL) {
3912  lastSource+=4*count;
3913  while(count>0) {
3914  *offsets++=sourceIndex++;
3915  *offsets++=sourceIndex++;
3916  *offsets++=sourceIndex++;
3917  *offsets++=sourceIndex++;
3918  --count;
3919  }
3920  }
3921 
3922  c=0;
3923  }
3924 #endif
3925 
3926  while(targetCapacity>0) {
3927  /*
3928  * Get a correct Unicode code point:
3929  * a single UChar for a BMP code point or
3930  * a matched surrogate pair for a "supplementary code point".
3931  */
3932  c=*source++;
3933  /*
3934  * Do not immediately check for single surrogates:
3935  * Assume that they are unassigned and check for them in that case.
3936  * This speeds up the conversion of assigned characters.
3937  */
3938  /* convert the Unicode code point in c into codepage bytes */
3939  if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3940  *target++=(uint8_t)c;
3941  --targetCapacity;
3942  c=0;
3943  continue;
3944  }
3946  /* is this code point assigned, or do we use fallbacks? */
3947  if(value>=minValue) {
3948  /* assigned, write the output character bytes from value and length */
3949  /* length==1 */
3950  /* this is easy because we know that there is enough space */
3951  *target++=(uint8_t)value;
3952  --targetCapacity;
3953 
3954  /* normal end of conversion: prepare for a new character */
3955  c=0;
3956  continue;
3957  } else if(!U16_IS_SURROGATE(c)) {
3958  /* normal, unassigned BMP character */
3959  } else if(U16_IS_SURROGATE_LEAD(c)) {
3960 getTrail:
3961  if(source<sourceLimit) {
3962  /* test the following code unit */
3963  UChar trail=*source;
3964  if(U16_IS_TRAIL(trail)) {
3965  ++source;
3966  c=U16_GET_SUPPLEMENTARY(c, trail);
3967  /* this codepage does not map supplementary code points */
3968  /* callback(unassigned) */
3969  } else {
3970  /* this is an unmatched lead code unit (1st surrogate) */
3971  /* callback(illegal) */
3972  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3973  break;
3974  }
3975  } else {
3976  /* no more input */
      /* with flush set, a lead surrogate at the end of the input is reported as truncated */
3977  if (pArgs->flush) {
3978  *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3979  }
3980  break;
3981  }
3982  } else {
3983  /* this is an unmatched trail code unit (2nd surrogate) */
3984  /* callback(illegal) */
3985  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3986  break;
3987  }
3988 
3989  /* c does not have a mapping */
3990 
3991  /* get the number of code units for c to correctly advance sourceIndex */
3992  length=U16_LENGTH(c);
3993 
3994  /* set offsets since the start or the last extension */
3995  if(offsets!=NULL) {
3996  int32_t count=(int32_t)(source-lastSource);
3997 
3998  /* do not set the offset for this character */
3999  count-=length;
4000 
4001  while(count>0) {
4002  *offsets++=sourceIndex++;
4003  --count;
4004  }
4005  /* offsets and sourceIndex are now set for the current character */
4006  }
4007 
4008  /* try an extension mapping */
      /* lastSource marks where the extension lookup started, to measure what it consumed */
4009  lastSource=source;
4011  c, &source, sourceLimit,
4012  &target, (const uint8_t *)(pArgs->targetLimit),
4013  &offsets, sourceIndex,
4014  pArgs->flush,
4015  pErrorCode);
4016  sourceIndex+=length+(int32_t)(source-lastSource);
4017  lastSource=source;
4018 
4019  if(U_FAILURE(*pErrorCode)) {
4020  /* not mappable or buffer overflow */
4021  break;
4022  } else {
4023  /* a mapping was written to the target, continue */
4024 
4025  /* recalculate the targetCapacity after an extension mapping */
4026  targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4027  length=(int32_t)(sourceLimit-source);
4028  if(length<targetCapacity) {
4029  targetCapacity=length;
4030  }
4031  }
4032 
4033 #if MBCS_UNROLL_SINGLE_FROM_BMP
4034  /* unrolling makes it slower on Pentium III/Windows 2000?! */
4035  goto unrolled;
4036 #endif
4037  }
4038 
      /* the loop counter cannot tell "input exhausted" from "target full", so test for the latter here */
4039  if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
4040  /* target is full */
4041  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4042  }
4043 
4044  /* set offsets since the start or the last callback */
4045  if(offsets!=NULL) {
4046  size_t count=source-lastSource;
4047  if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
4048  /*
4049  Caller gave us a partial supplementary character,
4050  which this function couldn't convert in any case.
4051  The callback will handle the offset.
4052  */
4053  count--;
4054  }
4055  while(count>0) {
4056  *offsets++=sourceIndex++;
4057  --count;
4058  }
4059  }
4060 
4061  /* set the converter state back into UConverter */
4062  cnv->fromUChar32=c;
4063 
4064  /* write back the updated pointers */
4065  pArgs->source=source;
4066  pArgs->target=(char *)target;
4067  pArgs->offsets=offsets;
4068 }
4069 
4070 U_CFUNC void
4072  UErrorCode *pErrorCode) {
4073  UConverter *cnv;
4074  const UChar *source, *sourceLimit;
4075  uint8_t *target;
4076  int32_t targetCapacity;
4077  int32_t *offsets;
4078 
4079  const uint16_t *table;
4080  const uint16_t *mbcsIndex;
4081  const uint8_t *p, *bytes;
4082  uint8_t outputType;
4083 
4084  UChar32 c;
4085 
4086  int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
4087 
4088  uint32_t stage2Entry;
4089  uint32_t asciiRoundtrips;
4090  uint32_t value;
4091  /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
4092  uint8_t siBytes[2] = {0, 0};
4093  uint8_t soBytes[2] = {0, 0};
4094  uint8_t siLength, soLength;
4095  int32_t length = 0, prevLength;
4096  uint8_t unicodeMask;
4097 
4098  cnv=pArgs->converter;
4099 
4100  if(cnv->preFromUFirstCP>=0) {
4101  /*
4102  * pass sourceIndex=-1 because we continue from an earlier buffer
4103  * in the future, this may change with continuous offsets
4104  */
4105  ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
4106 
4107  if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
4108  return;
4109  }
4110  }
4111 
4112  /* use optimized function if possible */
4113  outputType=cnv->sharedData->mbcs.outputType;
4114  unicodeMask=cnv->sharedData->mbcs.unicodeMask;
4115  if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4116  if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4117  ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
4118  } else {
4119  ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
4120  }
4121  return;
4122  } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
4123  ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
4124  return;
4125  }
4126 
4127  /* set up the local pointers */
4128  source=pArgs->source;
4129  sourceLimit=pArgs->sourceLimit;
4130  target=(uint8_t *)pArgs->target;
4131  targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
4132  offsets=pArgs->offsets;
4133 
4136  mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
4137  } else {
4138  mbcsIndex=NULL;
4139  }
4140  if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4142  } else {
4144  }
4145  asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4146 
4147  /* get the converter state from UConverter */
4148  c=cnv->fromUChar32;
4149 
4150  if(outputType==MBCS_OUTPUT_2_SISO) {
4151  prevLength=cnv->fromUnicodeStatus;
4152  if(prevLength==0) {
4153  /* set the real value */
4154  prevLength=1;
4155  }
4156  } else {
4157  /* prevent fromUnicodeStatus from being set to something non-0 */
4158  prevLength=0;
4159  }
4160 
4161  /* sourceIndex=-1 if the current character began in the previous buffer */
4162  prevSourceIndex=-1;
4163  sourceIndex= c==0 ? 0 : -1;
4164  nextSourceIndex=0;
4165 
4166  /* Get the SI/SO character for the converter */
4167  siLength = static_cast<uint8_t>(getSISOBytes(SI, cnv->options, siBytes));
4168  soLength = static_cast<uint8_t>(getSISOBytes(SO, cnv->options, soBytes));
4169 
4170  /* conversion loop */
4171  /*
4172  * This is another piece of ugly code:
4173  * A goto into the loop if the converter state contains a first surrogate
4174  * from the previous function call.
4175  * It saves me to check in each loop iteration a check of if(c==0)
4176  * and duplicating the trail-surrogate-handling code in the else
4177  * branch of that check.
4178  * I could not find any other way to get around this other than
4179  * using a function call for the conversion and callback, which would
4180  * be even more inefficient.
4181  *
4182  * Markus Scherer 2000-jul-19
4183  */
4184  if(c!=0 && targetCapacity>0) {
4185  goto getTrail;
4186  }
4187 
4188  while(source<sourceLimit) {
4189  /*
4190  * This following test is to see if available input would overflow the output.
4191  * It does not catch output of more than one byte that
4192  * overflows as a result of a multi-byte character or callback output
4193  * from the last source character.
4194  * Therefore, those situations also test for overflows and will
4195  * then break the loop, too.
4196  */
4197  if(targetCapacity>0) {
4198  /*
4199  * Get a correct Unicode code point:
4200  * a single UChar for a BMP code point or
4201  * a matched surrogate pair for a "supplementary code point".
4202  */
4203  c=*source++;
4204  ++nextSourceIndex;
4205  if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
4206  *target++=(uint8_t)c;
4207  if(offsets!=NULL) {
4208  *offsets++=sourceIndex;
4209  prevSourceIndex=sourceIndex;
4210  sourceIndex=nextSourceIndex;
4211  }
4212  --targetCapacity;
4213  c=0;
4214  continue;
4215  }
4216  /*
4217  * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
4218  * to avoid dealing with surrogates.
4219  * MBCS_FAST_MAX must be >=0xd7ff.
4220  */
4221  if(c<=0xd7ff && mbcsIndex!=NULL) {
4222  value=mbcsIndex[c>>6];
4223 
4224  /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
4225  /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
4226  switch(outputType) {
4227  case MBCS_OUTPUT_2:
4228  value=((const uint16_t *)bytes)[value +(c&0x3f)];
4229  if(value<=0xff) {
4230  if(value==0) {
4231  goto unassigned;
4232  } else {
4233  length=1;
4234  }
4235  } else {
4236  length=2;
4237  }
4238  break;
4239  case MBCS_OUTPUT_2_SISO:
4240  /* 1/2-byte stateful with Shift-In/Shift-Out */
4241  /*
4242  * Save the old state in the converter object
4243  * right here, then change the local prevLength state variable if necessary.
4244  * Then, if this character turns out to be unassigned or a fallback that
4245  * is not taken, the callback code must not save the new state in the converter
4246  * because the new state is for a character that is not output.
4247  * However, the callback must still restore the state from the converter
4248  * in case the callback function changed it for its output.
4249  */
4250  cnv->fromUnicodeStatus=prevLength; /* save the old state */
4251  value=((const uint16_t *)bytes)[value +(c&0x3f)];
4252  if(value<=0xff) {
4253  if(value==0) {
4254  goto unassigned;
4255  } else if(prevLength<=1) {
4256  length=1;
4257  } else {
4258  /* change from double-byte mode to single-byte */
4259  if (siLength == 1) {
4260  value|=(uint32_t)siBytes[0]<<8;
4261  length = 2;
4262  } else if (siLength == 2) {
4263  value|=(uint32_t)siBytes[1]<<8;
4264  value|=(uint32_t)siBytes[0]<<16;
4265  length = 3;
4266  }
4267  prevLength=1;
4268  }
4269  } else {
4270  if(prevLength==2) {
4271  length=2;
4272  } else {
4273  /* change from single-byte mode to double-byte */
4274  if (soLength == 1) {
4275  value|=(uint32_t)soBytes[0]<<16;
4276  length = 3;
4277  } else if (soLength == 2) {
4278  value|=(uint32_t)soBytes[1]<<16;
4279  value|=(uint32_t)soBytes[0]<<24;
4280  length = 4;
4281  }
4282  prevLength=2;
4283  }
4284  }
4285  break;
4286  case MBCS_OUTPUT_DBCS_ONLY:
4287  /* table with single-byte results, but only DBCS mappings used */
4288  value=((const uint16_t *)bytes)[value +(c&0x3f)];
4289  if(value<=0xff) {
4290  /* no mapping or SBCS result, not taken for DBCS-only */
4291  goto unassigned;
4292  } else {
4293  length=2;
4294  }
4295  break;
4296  case MBCS_OUTPUT_3:
4297  p=bytes+(value+(c&0x3f))*3;
4298  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4299  if(value<=0xff) {
4300  if(value==0) {
4301  goto unassigned;
4302  } else {
4303  length=1;
4304  }
4305  } else if(value<=0xffff) {
4306  length=2;
4307  } else {
4308  length=3;
4309  }
4310  break;
4311  case MBCS_OUTPUT_4:
4312  value=((const uint32_t *)bytes)[value +(c&0x3f)];
4313  if(value<=0xff) {
4314  if(value==0) {
4315  goto unassigned;
4316  } else {
4317  length=1;
4318  }
4319  } else if(value<=0xffff) {
4320  length=2;
4321  } else if(value<=0xffffff) {
4322  length=3;
4323  } else {
4324  length=4;
4325  }
4326  break;
4327  case MBCS_OUTPUT_3_EUC:
4328  value=((const uint16_t *)bytes)[value +(c&0x3f)];
4329  /* EUC 16-bit fixed-length representation */
4330  if(value<=0xff) {
4331  if(value==0) {
4332  goto unassigned;
4333  } else {
4334  length=1;
4335  }
4336  } else if((value&0x8000)==0) {
4337  value|=0x8e8000;
4338  length=3;
4339  } else if((value&0x80)==0) {
4340  value|=0x8f0080;
4341  length=3;
4342  } else {
4343  length=2;
4344  }
4345  break;
4346  case MBCS_OUTPUT_4_EUC:
4347  p=bytes+(value+(c&0x3f))*3;
4348  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4349  /* EUC 16-bit fixed-length representation applied to the first two bytes */
4350  if(value<=0xff) {
4351  if(value==0) {
4352  goto unassigned;
4353  } else {
4354  length=1;
4355  }
4356  } else if(value<=0xffff) {
4357  length=2;
4358  } else if((value&0x800000)==0) {
4359  value|=0x8e800000;
4360  length=4;
4361  } else if((value&0x8000)==0) {
4362  value|=0x8f008000;
4363  length=4;
4364  } else {
4365  length=3;
4366  }
4367  break;
4368  default:
4369  /* must not occur */
4370  /*
4371  * To avoid compiler warnings that value & length may be
4372  * used without having been initialized, we set them here.
4373  * In reality, this is unreachable code.
4374  * Not having a default branch also causes warnings with
4375  * some compilers.
4376  */
4377  value=0;
4378  length=0;
4379  break;
4380  }
4381  /* output the value */
4382  } else {
4383  /*
4384  * This also tests if the codepage maps single surrogates.
4385  * If it does, then surrogates are not paired but mapped separately.
4386  * Note that in this case unmatched surrogates are not detected.
4387  */
4388  if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4389  if(U16_IS_SURROGATE_LEAD(c)) {
4390 getTrail:
4391  if(source<sourceLimit) {
4392  /* test the following code unit */
4393  UChar trail=*source;
4394  if(U16_IS_TRAIL(trail)) {
4395  ++source;
4396  ++nextSourceIndex;
4397  c=U16_GET_SUPPLEMENTARY(c, trail);
4398  if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4399  /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4400  cnv->fromUnicodeStatus=prevLength; /* save the old state */
4401  /* callback(unassigned) */
4402  goto unassigned;
4403  }
4404  /* convert this supplementary code point */
4405  /* exit this condition tree */
4406  } else {
4407  /* this is an unmatched lead code unit (1st surrogate) */
4408  /* callback(illegal) */
4409  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4410  break;
4411  }
4412  } else {
4413  /* no more input */
4414  break;
4415  }
4416  } else {
4417  /* this is an unmatched trail code unit (2nd surrogate) */
4418  /* callback(illegal) */
4419  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4420  break;
4421  }
4422  }
4423 
4424  /* convert the Unicode code point in c into codepage bytes */
4425 
4426  /*
4427  * The basic lookup is a triple-stage compact array (trie) lookup.
4428  * For details see the beginning of this file.
4429  *
4430  * Single-byte codepages are handled with a different data structure
4431  * by _MBCSSingle... functions.
4432  *
4433  * The result consists of a 32-bit value from stage 2 and
4434  * a pointer to as many bytes as are stored per character.
4435  * The pointer points to the character's bytes in stage 3.
4436  * Bits 15..0 of the stage 2 entry contain the stage 3 index
4437  * for that pointer, while bits 31..16 are flags for which of
4438  * the 16 characters in the block are roundtrip-assigned.
4439  *
4440  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4441  * respectively as uint32_t, in the platform encoding.
4442  * For 3-byte codepages, the bytes are always stored in big-endian order.
4443  *
4444  * For EUC encodings that use only either 0x8e or 0x8f as the first
4445  * byte of their longest byte sequences, the first two bytes in
4446  * this third stage indicate with their 7th bits whether these bytes
4447  * are to be written directly or actually need to be preceded by
4448  * one of the two Single-Shift codes. With this, the third stage
4449  * stores one byte fewer per character than the actual maximum length of
4450  * EUC byte sequences.
4451  *
4452  * Other than that, leading zero bytes are removed and the other
4453  * bytes output. A single zero byte may be output if the "assigned"
4454  * bit in stage 2 was on.
4455  * The data structure does not support zero byte output as a fallback,
4456  * and also does not allow output of leading zeros.
4457  */
4458  stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4459 
4460  /* get the bytes and the length for the output */
4461  switch(outputType) {
4462  case MBCS_OUTPUT_2:
4463  value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4464  if(value<=0xff) {
4465  length=1;
4466  } else {
4467  length=2;
4468  }
4469  break;
4470  case MBCS_OUTPUT_2_SISO:
4471  /* 1/2-byte stateful with Shift-In/Shift-Out */
4472  /*
4473  * Save the old state in the converter object
4474  * right here, then change the local prevLength state variable if necessary.
4475  * Then, if this character turns out to be unassigned or a fallback that
4476  * is not taken, the callback code must not save the new state in the converter
4477  * because the new state is for a character that is not output.
4478  * However, the callback must still restore the state from the converter
4479  * in case the callback function changed it for its output.
4480  */
4481  cnv->fromUnicodeStatus=prevLength; /* save the old state */
4482  value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4483  if(value<=0xff) {
4484  if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4485  /* no mapping, leave value==0 */
4486  length=0;
4487  } else if(prevLength<=1) {
4488  length=1;
4489  } else {
4490  /* change from double-byte mode to single-byte */
4491  if (siLength == 1) {
4492  value|=(uint32_t)siBytes[0]<<8;
4493  length = 2;
4494  } else if (siLength == 2) {
4495  value|=(uint32_t)siBytes[1]<<8;
4496  value|=(uint32_t)siBytes[0]<<16;
4497  length = 3;
4498  }
4499  prevLength=1;
4500  }
4501  } else {
4502  if(prevLength==2) {
4503  length=2;
4504  } else {
4505  /* change from single-byte mode to double-byte */
4506  if (soLength == 1) {
4507  value|=(uint32_t)soBytes[0]<<16;
4508  length = 3;
4509  } else if (soLength == 2) {
4510  value|=(uint32_t)soBytes[1]<<16;
4511  value|=(uint32_t)soBytes[0]<<24;
4512  length = 4;
4513  }
4514  prevLength=2;
4515  }
4516  }
4517  break;
4518  case MBCS_OUTPUT_DBCS_ONLY:
4519  /* table with single-byte results, but only DBCS mappings used */
4520  value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4521  if(value<=0xff) {
4522  /* no mapping or SBCS result, not taken for DBCS-only */
4523  value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4524  length=0;
4525  } else {
4526  length=2;
4527  }
4528  break;
4529  case MBCS_OUTPUT_3:
4530  p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4531  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4532  if(value<=0xff) {
4533  length=1;
4534  } else if(value<=0xffff) {
4535  length=2;
4536  } else {
4537  length=3;
4538  }
4539  break;
4540  case MBCS_OUTPUT_4:
4541  value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4542  if(value<=0xff) {
4543  length=1;
4544  } else if(value<=0xffff) {
4545  length=2;
4546  } else if(value<=0xffffff) {
4547  length=3;
4548  } else {
4549  length=4;
4550  }
4551  break;
4552  case MBCS_OUTPUT_3_EUC:
4553  value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4554  /* EUC 16-bit fixed-length representation */
4555  if(value<=0xff) {
4556  length=1;
4557  } else if((value&0x8000)==0) {
4558  value|=0x8e8000;
4559  length=3;
4560  } else if((value&0x80)==0) {
4561  value|=0x8f0080;
4562  length=3;
4563  } else {
4564  length=2;
4565  }
4566  break;
4567  case MBCS_OUTPUT_4_EUC:
4568  p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4569  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4570  /* EUC 16-bit fixed-length representation applied to the first two bytes */
4571  if(value<=0xff) {
4572  length=1;
4573  } else if(value<=0xffff) {
4574  length=2;
4575  } else if((value&0x800000)==0) {
4576  value|=0x8e800000;
4577  length=4;
4578  } else if((value&0x8000)==0) {
4579  value|=0x8f008000;
4580  length=4;
4581  } else {
4582  length=3;
4583  }
4584  break;
4585  default:
4586  /* must not occur */
4587  /*
4588  * To avoid compiler warnings that value & length may be
4589  * used without having been initialized, we set them here.
4590  * In reality, this is unreachable code.
4591  * Not having a default branch also causes warnings with
4592  * some compilers.
4593  */
4594  value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4595  length=0;
4596  break;
4597  }
4598 
4599  /* is this code point assigned, or do we use fallbacks? */
4600  if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4601  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4602  ) {
4603  /*
4604  * We allow a 0 byte output if the "assigned" bit is set for this entry.
4605  * There is no way with this data structure for fallback output
4606  * to be a zero byte.
4607  */
4608 
4609 unassigned:
4610  /* try an extension mapping */
4611  pArgs->source=source;
4613  c, &source, sourceLimit,
4614  &target, target+targetCapacity,
4615  &offsets, sourceIndex,
4616  pArgs->flush,
4617  pErrorCode);
4618  nextSourceIndex+=(int32_t)(source-pArgs->source);
4619  prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4620 
4621  if(U_FAILURE(*pErrorCode)) {
4622  /* not mappable or buffer overflow */
4623  break;
4624  } else {
4625  /* a mapping was written to the target, continue */
4626 
4627  /* recalculate the targetCapacity after an extension mapping */
4628  targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4629 
4630  /* normal end of conversion: prepare for a new character */
4631  if(offsets!=NULL) {
4632  prevSourceIndex=sourceIndex;
4633  sourceIndex=nextSourceIndex;
4634  }
4635  continue;
4636  }
4637  }
4638  }
4639 
4640  /* write the output character bytes from value and length */
4641  /* from the first if in the loop we know that targetCapacity>0 */
4642  if(length<=targetCapacity) {
4643  if(offsets==NULL) {
4644  switch(length) {
4645  /* each branch falls through to the next one */
4646  case 4:
4647  *target++=(uint8_t)(value>>24);
4648  U_FALLTHROUGH;
4649  case 3:
4650  *target++=(uint8_t)(value>>16);
4651  U_FALLTHROUGH;
4652  case 2:
4653  *target++=(uint8_t)(value>>8);
4654  U_FALLTHROUGH;
4655  case 1:
4656  *target++=(uint8_t)value;
4657  U_FALLTHROUGH;
4658  default:
4659  /* will never occur */
4660  break;
4661  }
4662  } else {
4663  switch(length) {
4664  /* each branch falls through to the next one */
4665  case 4:
4666  *target++=(uint8_t)(value>>24);
4667  *offsets++=sourceIndex;
4668  U_FALLTHROUGH;
4669  case 3:
4670  *target++=(uint8_t)(value>>16);
4671  *offsets++=sourceIndex;
4672  U_FALLTHROUGH;
4673  case 2:
4674  *target++=(uint8_t)(value>>8);
4675  *offsets++=sourceIndex;
4676  U_FALLTHROUGH;
4677  case 1:
4678  *target++=(uint8_t)value;
4679  *offsets++=sourceIndex;
4680  U_FALLTHROUGH;
4681  default:
4682  /* will never occur */
4683  break;
4684  }
4685  }
4686  targetCapacity-=length;
4687  } else {
4688  uint8_t *charErrorBuffer;
4689 
4690  /*
4691  * We actually do this backwards here:
4692  * In order to save an intermediate variable, we output
4693  * first to the overflow buffer what does not fit into the
4694  * regular target.
4695  */
4696  /* we know that 1<=targetCapacity<length<=4 */
4697  length-=targetCapacity;
4698  charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4699  switch(length) {
4700  /* each branch falls through to the next one */
4701  case 3:
4702  *charErrorBuffer++=(uint8_t)(value>>16);
4703  U_FALLTHROUGH;
4704  case 2:
4705  *charErrorBuffer++=(uint8_t)(value>>8);
4706  U_FALLTHROUGH;
4707  case 1:
4708  *charErrorBuffer=(uint8_t)value;
4709  U_FALLTHROUGH;
4710  default:
4711  /* will never occur */
4712  break;
4713  }
4715 
4716  /* now output what fits into the regular target */
4717  value>>=8*length; /* length was reduced by targetCapacity */
4718  switch(targetCapacity) {
4719  /* each branch falls through to the next one */
4720  case 3:
4721  *target++=(uint8_t)(value>>16);
4722  if(offsets!=NULL) {
4723  *offsets++=sourceIndex;
4724  }
4725  U_FALLTHROUGH;
4726  case 2:
4727  *target++=(uint8_t)(value>>8);
4728  if(offsets!=NULL) {
4729  *offsets++=sourceIndex;
4730  }
4731  U_FALLTHROUGH;
4732  case 1:
4733  *target++=(uint8_t)value;
4734  if(offsets!=NULL) {
4735  *offsets++=sourceIndex;
4736  }
4737  U_FALLTHROUGH;
4738  default:
4739  /* will never occur */
4740  break;
4741  }
4742 
4743  /* target overflow */
4744  targetCapacity=0;
4745  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4746  c=0;
4747  break;
4748  }
4749 
4750  /* normal end of conversion: prepare for a new character */
4751  c=0;
4752  if(offsets!=NULL) {
4753  prevSourceIndex=sourceIndex;
4754  sourceIndex=nextSourceIndex;
4755  }
4756  continue;
4757  } else {
4758  /* target is full */
4759  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4760  break;
4761  }
4762  }
4763 
4764  /*
4765  * the end of the input stream and detection of truncated input
4766  * are handled by the framework, but for EBCDIC_STATEFUL conversion
4767  * we need to emit an SI at the very end
4768  *
4769  * conditions:
4770  * successful
4771  * EBCDIC_STATEFUL in DBCS mode
4772  * end of input and no truncated input
4773  */
4774  if( U_SUCCESS(*pErrorCode) &&
4775  outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4776  pArgs->flush && source>=sourceLimit && c==0
4777  ) {
4778  /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4779  if(targetCapacity>0) {
4780  *target++=(uint8_t)siBytes[0];
4781  if (siLength == 2) {
4782  if (targetCapacity<2) {
4783  cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
4785  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4786  } else {
4787  *target++=(uint8_t)siBytes[1];
4788  }
4789  }
4790  if(offsets!=NULL) {
4791  /* set the last source character's index (sourceIndex points at sourceLimit now) */
4792  *offsets++=prevSourceIndex;
4793  }
4794  } else {
4795  /* target is full */
4796  cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
4797  if (siLength == 2) {
4798  cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
4799  }
4800  cnv->charErrorBufferLength=siLength;
4801  *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4802  }
4803  prevLength=1; /* we switched into SBCS */
4804  }
4805 
4806  /* set the converter state back into UConverter */
4807  cnv->fromUChar32=c;
4808  cnv->fromUnicodeStatus=prevLength;
4809 
4810  /* write back the updated pointers */
4811  pArgs->source=source;
4812  pArgs->target=(char *)target;
4813  pArgs->offsets=offsets;
4814 }
4815 
4816 /*
4817  * This is another simple conversion function for internal use by other
4818  * conversion implementations.
4819  * It does not use the converter state nor call callbacks.
4820  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4821  * It handles conversion extensions but not GB 18030.
4822  *
4823  * It converts one single Unicode code point into codepage bytes, encoded
4824  * as one 32-bit value. The function returns the number of bytes in *pValue:
4825  * 1..4 the number of bytes in *pValue
4826  * 0 unassigned (*pValue undefined)
4827  * -1 illegal (not reported for c itself; currently returned only for an output type that this function does not handle, *pValue undefined)
4828  *
4829  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4830  * the second to last byte in bits 15..8, etc.
4831  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4832  */
4835  UChar32 c, uint32_t *pValue,
4836  UBool useFallback) {
      /*
       * Overview of the body:
       * 1. If c is a BMP code point, or unicodeMask has UCNV_HAS_SUPPLEMENTARY set,
       *    c is looked up in the base fromUnicode table.
       * 2. If that lookup is skipped or yields neither a roundtrip nor an accepted
       *    fallback, the extension table (extIndexes) is tried when there is one.
       * 3. Otherwise 0 is returned for "unassigned".
       * NOTE(review): sharedData is used below, but the beginning of the
       * signature that declares it is not visible in this listing.
       */
4837  const int32_t *cx;
4838  const uint16_t *table;
4839 #if 0
4840 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4841  const uint8_t *p;
4842 #endif
4843  uint32_t stage2Entry;
4844  uint32_t value;
4845  int32_t length;
4846 
4847  /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4848  if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4849  table=sharedData->mbcs.fromUnicodeTable;
4850 
4851  /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4852  if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
      /*
       * NOTE(review): value is tested below, but no assignment to it is visible
       * in this branch (the embedded listing numbers skip from 4852 to 4854).
       * Confirm against the upstream file that the single-byte lookup is present.
       */
4854  /* is this code point assigned, or do we use fallbacks? */
      /*
       * With useFallback the lower threshold 0x800 is accepted, without it 0xc00;
       * presumably the bits above the low byte are assignment flags - TODO confirm.
       * Only the low byte of value is stored as the single output byte.
       */
4855  if(useFallback ? value>=0x800 : value>=0xc00) {
4856  *pValue=value&0xff;
4857  return 1;
4858  }
4859  } else /* outputType!=MBCS_OUTPUT_1 */ {
4860  stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4861 
4862  /* get the bytes and the length for the output */
      /*
       * Only the MBCS_OUTPUT_2 case is compiled in; the remaining cases are inside
       * #if 0, so any other output type reaches default and returns -1.
       */
4863  switch(sharedData->mbcs.outputType) {
4864  case MBCS_OUTPUT_2:
4865  value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4866  if(value<=0xff) {
4867  length=1;
4868  } else {
4869  length=2;
4870  }
4871  break;
4872 #if 0
4873 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4874  case MBCS_OUTPUT_DBCS_ONLY:
4875  /* table with single-byte results, but only DBCS mappings used */
4876  value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4877  if(value<=0xff) {
4878  /* no mapping or SBCS result, not taken for DBCS-only */
4879  value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4880  length=0;
4881  } else {
4882  length=2;
4883  }
4884  break;
4885  case MBCS_OUTPUT_3:
4886  p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4887  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4888  if(value<=0xff) {
4889  length=1;
4890  } else if(value<=0xffff) {
4891  length=2;
4892  } else {
4893  length=3;
4894  }
4895  break;
4896  case MBCS_OUTPUT_4:
4897  value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4898  if(value<=0xff) {
4899  length=1;
4900  } else if(value<=0xffff) {
4901  length=2;
4902  } else if(value<=0xffffff) {
4903  length=3;
4904  } else {
4905  length=4;
4906  }
4907  break;
4908  case MBCS_OUTPUT_3_EUC:
4909  value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4910  /* EUC 16-bit fixed-length representation */
4911  if(value<=0xff) {
4912  length=1;
4913  } else if((value&0x8000)==0) {
4914  value|=0x8e8000;
4915  length=3;
4916  } else if((value&0x80)==0) {
4917  value|=0x8f0080;
4918  length=3;
4919  } else {
4920  length=2;
4921  }
4922  break;
4923  case MBCS_OUTPUT_4_EUC:
4924  p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4925  value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4926  /* EUC 16-bit fixed-length representation applied to the first two bytes */
4927  if(value<=0xff) {
4928  length=1;
4929  } else if(value<=0xffff) {
4930  length=2;
4931  } else if((value&0x800000)==0) {
4932  value|=0x8e800000;
4933  length=4;
4934  } else if((value&0x8000)==0) {
4935  value|=0x8f008000;
4936  length=4;
4937  } else {
4938  length=3;
4939  }
4940  break;
4941 #endif
4942  default:
4943  /* must not occur */
4944  return -1;
4945  }
4946 
4947  /* is this code point assigned, or do we use fallbacks? */
4948  if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4949  (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4950  ) {
4951  /*
4952  * We allow a 0 byte output if the "assigned" bit is set for this entry.
4953  * There is no way with this data structure for fallback output
4954  * to be a zero byte.
4955  */
4956  /* assigned */
4957  *pValue=value;
4958  return length;
4959  }
4960  }
4961  }
4962 
      /*
       * Reached when the base table was not consulted or gave no accepted mapping.
       * The sign of the helper's result is dropped before returning; presumably a
       * negative length marks a fallback mapping - confirm against ucnv_ext.
       */
4963  cx=sharedData->mbcs.extIndexes;
4964  if(cx!=NULL) {
4965  length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4966  return length>=0 ? length : -length; /* return abs(length); */
4967  }
4968 
4969  /* unassigned */
4970  return 0;
4971 }
4972 
4973 
4974 #if 0
4975 /*
4976  * This function has been moved to ucnv2022.c for inlining.
4977  * This implementation is here only for documentation purposes
4978  */
4979 
4980 /**
4981  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
4982  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4983  * It does not handle conversion extensions (_extFromU()).
4984  *
4985  * It returns the codepage byte for the code point, or -1 if it is unassigned.
4986  */
4989