// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2000-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ucnvmbcs.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2000jul03 * created by: Markus W. Scherer * * The current code in this file replaces the previous implementation * of conversion code from multi-byte codepages to Unicode and back. * This implementation supports the following: * - legacy variable-length codepages with up to 4 bytes per character * - all Unicode code points (up to 0x10ffff) * - efficient distinction of unassigned vs. illegal byte sequences * - it is possible in fromUnicode() to directly deal with simple * stateful encodings (used for EBCDIC_STATEFUL) * - it is possible to convert Unicode code points * to a single zero byte (but not as a fallback except for SBCS) * * Remaining limitations in fromUnicode: * - byte sequences must not have leading zero bytes * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte * - limitation to up to 4 bytes per character * * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these * limitations and adds m:n character mappings and other features. * See ucnv_ext.h for details. 
* * Change history: * * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 * macros to ucnvmbcs.h file */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION #include "unicode/ucnv.h" #include "unicode/ucnv_cb.h" #include "unicode/udata.h" #include "unicode/uset.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "ucnv_bld.h" #include "ucnvmbcs.h" #include "ucnv_ext.h" #include "ucnv_cnv.h" #include "cmemory.h" #include "cstring.h" #include "umutex.h" #include "ustr_imp.h" /* control optimizations according to the platform */ #define MBCS_UNROLL_SINGLE_TO_BMP … #define MBCS_UNROLL_SINGLE_FROM_BMP … /* * _MBCSHeader versions 5.3 & 4.3 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) * * This version is optional. Version 5 is used for incompatible data format changes. * makeconv will continue to generate version 4 files if possible. * * Changes from version 4: * * The main difference is an additional _MBCSHeader field with * - the length (number of uint32_t) of the _MBCSHeader * - flags for further incompatible data format changes * - flags for further, backward compatible data format changes * * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from * the file and needs to be reconstituted at load time. * This requires a utf8Friendly format with an additional mbcsIndex table for fast * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. * (For details about these structures see below, and see ucnvmbcs.h.) * * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order * of the Unicode code points. (This requires that the .ucm file has the |0 etc. * precision markers for all mappings.) 
* * All fallbacks have been moved to the extension table, leaving only roundtrips in the * omitted data that can be reconstituted from the toUnicode data. * * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. * With only roundtrip mappings in the base fromUnicode data, this part is fully * redundant with the mbcsIndex and will be reconstituted from that (also using the * stage 1 table which contains the information about how stage 2 was compacted). * * The rest of the stage 2 table, the part for code points above maxFastUChar, * is stored in the file and will be appended to the reconstituted part. * * The entire fromUBytes array is omitted from the file and will be reconstituted. * This is done by enumerating all toUnicode roundtrip mappings, performing * each mapping (using the stage 1 and reconstituted stage 2 tables) and * writing instead of reading the byte values. * * _MBCSHeader version 4.3 * * Change from version 4.2: * - Optional utf8Friendly data structures, with 64-entry stage 3 block * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS * files which can be used instead of stages 1 & 2. * Faster lookups for roundtrips from most commonly used characters, * and lookups from UTF-8 byte sequences with a natural bit distribution. * See ucnvmbcs.h for more details. * * Change from version 4.1: * - Added an optional extension table structure at the end of the .cnv file. * It is present if the upper bits of the header flags field contain a non-zero * byte offset to it. * Files that contain only a conversion table and no base table * use the special outputType MBCS_OUTPUT_EXT_ONLY. * These contain the base table name between the MBCS header and the extension * data. * * Change from version 4.0: * - Replace header.reserved with header.fromUBytesLength so that all * fields in the data have length. 
* * Changes from version 3 (for performance improvements): * - new bit distribution for state table entries * - reordered action codes * - new data structure for single-byte fromUnicode * + stage 2 only contains indexes * + stage 3 stores 16 bits per character with classification bits 15..8 * - no multiplier for stage 1 entries * - stage 2 for non-single-byte codepages contains the index and the flags in * one 32-bit value * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers * * For more details about old versions of the MBCS data structure, see * the corresponding versions of this file. * * Converting stateless codepage data ---------------------------------------*** * (or codepage data with simple states) to Unicode. * * Data structure and algorithm for converting from complex legacy codepages * to Unicode. (Designed before 2000-may-22.) * * The basic idea is that the structure of legacy codepages can be described * with state tables. * When reading a byte stream, each input byte causes a state transition. * Some transitions result in the output of a code point, some result in * "unassigned" or "illegal" output. * This is used here for character conversion. * * The data structure begins with a state table consisting of a row * per state, with 256 entries (columns) per row for each possible input * byte value. * Each entry is 32 bits wide, with two formats distinguished by * the sign bit (bit 31): * * One format for transitional entries (bit 31 not set) for non-final bytes, and * one format for final entries (bit 31 set). * Both formats contain the number of the next state in the same bit * positions. * State 0 is the initial state. * * Most of the time, the offset values of subsequent states are added * up to a scalar value. This value will eventually be the index of * the Unicode code point in a table that follows the state table. * The effect is that the code points for final state table rows * are contiguous. 
The code points of final state rows follow each other * in the order of the references to those final states by previous * states, etc. * * For some terminal states, the offset is itself the output Unicode * code point (16 bits for a BMP code point or 20 bits for a supplementary * code point, stored as code point minus 0x10000 so that 20 bits are enough). * For others, the code point in the Unicode table is stored with either * one or two code units: one for BMP code points, two for a pair of * surrogates. * All code points for a final state entry take up the same number of code * units, regardless of whether they all actually _use_ the same number * of code units. This is necessary for simple array access. * * An additional feature comes in with what in ICU is called "fallback" * mappings: * * In addition to round-trippable, precise, 1:1 mappings, there are often * mappings defined between similar, though not the same, characters. * Typically, such mappings occur only in fromUnicode mapping tables because * Unicode has a superset repertoire of most other codepages. However, it * is possible to provide such mappings in the toUnicode tables, too. * In this case, the fallback mappings are partly integrated into the * general state tables because the structure of the encoding includes their * byte sequences. * For final entries in an initial state, fallback mappings are stored in * the entry itself like with roundtrip mappings. * For other final entries, they are stored in the code units table if * the entry is for a pair of code units. * For single-unit results in the code units table, there is no space to * alternatively hold a fallback mapping; in this case, the code unit * is stored as U+fffe (unassigned), and the fallback mapping needs to * be looked up by the scalar offset value in a separate table. * * "Unassigned" state entries really mean "structurally unassigned", * i.e., such a byte sequence will never have a mapping result. 
* * The interpretation of the bits in each entry is as follows: * * Bit 31 not set, not a terminal entry ("transitional"): * 30..24 next state * 23..0 offset delta, to be added up * * Bit 31 set, terminal ("final") entry: * 30..24 next state (regardless of action code) * 23..20 action code: * action codes 0 and 1 result in precise-mapping Unicode code points * 0 valid byte sequence * 19..16 not used, 0 * 15..0 16-bit Unicode BMP code point * never U+fffe or U+ffff * 1 valid byte sequence * 19..0 20-bit Unicode supplementary code point * never U+fffe or U+ffff * * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points * 2 valid byte sequence (fallback) * 19..16 not used, 0 * 15..0 16-bit Unicode BMP code point as fallback result * 3 valid byte sequence (fallback) * 19..0 20-bit Unicode supplementary code point as fallback result * * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results * depending on the code units they result in * 4 valid byte sequence * 19..9 not used, 0 * 8..0 final offset delta * pointing to one 16-bit code unit which may be * fffe unassigned -- look for a fallback for this offset * ffff illegal * 5 valid byte sequence * 19..9 not used, 0 * 8..0 final offset delta * pointing to two 16-bit code units * (typically UTF-16 surrogates) * the result depends on the first code unit as follows: * 0000..d7ff roundtrip BMP code point (1st alone) * d800..dbff roundtrip surrogate pair (1st, 2nd) * dc00..dfff fallback surrogate pair (1st-400, 2nd) * e000 roundtrip BMP code point (2nd alone) * e001 fallback BMP code point (2nd alone) * fffe unassigned * ffff illegal * (the final offset deltas are at most 255 * 2, * times 2 because of storing code unit pairs) * * 6 unassigned byte sequence * 19..16 not used, 0 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) * this does not contain a final offset delta because the main * purpose of this action code is to save scalar offset values; * 
therefore, fallback values cannot be assigned to byte * sequences that result in this action code * 7 illegal byte sequence * 19..16 not used, 0 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) * 8 state change only * 19..0 not used, 0 * useful for state changes in simple stateful encodings, * at Shift-In/Shift-Out codes * * * 9..15 reserved for future use * current implementations will only perform a state change * and ignore bits 19..0 * * An encoding with contiguous ranges of unassigned byte sequences, like * Shift-JIS and especially EUC-TW, can be stored efficiently by having * at least two states for the trail bytes: * One trail byte state that results in code points, and one that only * has "unassigned" and "illegal" terminal states. * * Note: partly by accident, this data structure supports simple stateful * encodings without any additional logic. * Currently, only simple Shift-In/Shift-Out schemes are handled with * appropriate state tables (especially EBCDIC_STATEFUL!). * * MBCS version 2 added: * unassigned and illegal action codes have U+fffe and U+ffff * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() * * Converting from Unicode to codepage bytes --------------------------------*** * * The conversion data structure for fromUnicode is designed for the known * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to * a sequence of 1..4 bytes, in addition to a flag that indicates if there is * a roundtrip mapping. * * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 * like in the character properties table. * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 * with the resulting bytes is at offsetFromUBytes. * * Beginning with version 4, single-byte codepages have a significantly different * trie compared to other codepages. * In all cases, the entry in stage 1 is directly the index of the block of * 64 entries in stage 2. 
* * Single-byte lookup: * * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. * Stage 3 contains one 16-bit word per result: * Bits 15..8 indicate the kind of result: * f roundtrip result * c fallback result from private-use code point * 8 fallback result from other code points * 0 unassigned * Bits 7..0 contain the codepage byte. A zero byte is always possible. * * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. * ASCII code points can be looked up with a linear array access into stage 3. * See maxFastUChar and other details in ucnvmbcs.h. * * Multi-byte lookup: * * Stage 2 contains a 32-bit word for each 16-block in stage 3: * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) * If this test is false, then a non-zero result will be interpreted as * a fallback mapping. * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) * * Stage 3 contains 2, 3, or 4 bytes per result. * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, * while 3 bytes are stored as bytes in big-endian order. * Leading zero bytes are ignored, and the number of bytes is counted. * A zero byte mapping result is possible as a roundtrip result. * For some output types, the actual result is processed from this; * see ucnv_MBCSFromUnicodeWithOffsets(). * * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), * or (version 3 and up) for BMP-only codepages, it contains 64 entries. * * In version 4.3, a utf8Friendly file contains an mbcsIndex table. * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 
* ASCII code points can be looked up with a linear array access into stage 3. * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. * * In version 3, stage 2 blocks may overlap by multiples of the multiplier * for compaction. * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) * may overlap by any number of entries. * * MBCS version 2 added: * the converter checks for known output types, which allows * adding new ones without crashing an unaware converter */ /** * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from * consecutive sequences of bytes, starting from the one encoded in value, * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) * Does not currently support m:n mappings or reverse fallbacks. * This function will not be called for sequences of bytes with leading zeros. * * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() * @param value contains 1..4 bytes of the first byte sequence, right-aligned * @param codePoints resulting Unicode code points, or negative if a byte sequence does * not map to anything * @return true to continue enumeration, false to stop */ UConverterEnumToUCallback; static void U_CALLCONV ucnv_MBCSLoad(UConverterSharedData *sharedData, UConverterLoadArgs *pArgs, const uint8_t *raw, UErrorCode *pErrorCode); static void U_CALLCONV ucnv_MBCSUnload(UConverterSharedData *sharedData); static void U_CALLCONV ucnv_MBCSOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode); static UChar32 U_CALLCONV ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); static void U_CALLCONV ucnv_MBCSGetStarters(const UConverter* cnv, UBool starters[256], UErrorCode *pErrorCode); U_CDECL_BEGIN static const char* U_CALLCONV ucnv_MBCSGetName(const UConverter *cnv); U_CDECL_END static void U_CALLCONV ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode); static UChar32 
U_CALLCONV ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); static void U_CALLCONV ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, UConverterToUnicodeArgs *pToUArgs, UErrorCode *pErrorCode); static void U_CALLCONV ucnv_MBCSGetUnicodeSet(const UConverter *cnv, const USetAdder *sa, UConverterUnicodeSet which, UErrorCode *pErrorCode); static void U_CALLCONV ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, UConverterToUnicodeArgs *pToUArgs, UErrorCode *pErrorCode); static const UConverterImpl _SBCSUTF8Impl= …; static const UConverterImpl _DBCSUTF8Impl= …; static const UConverterImpl _MBCSImpl= …; /* Static data is in tools/makeconv/ucnvstat.c for data-based * converters. Be sure to update it as well. */ const UConverterSharedData _MBCSData= …; /* GB 18030 data ------------------------------------------------------------ */ /* helper macros for linear values for GB 18030 four-byte sequences */ #define LINEAR_18030(a, b, c, d) … #define LINEAR_18030_BASE … #define LINEAR(x) … /* * Some ranges of GB 18030 where both the Unicode code points and the * GB four-byte sequences are contiguous and are handled algorithmically by * the special callback functions below. * The values are start & end of Unicode & GB codes. * * Note that single surrogates are not mapped by GB 18030 * as of the re-released mapping tables from 2000-nov-30. 
*/ static const uint32_t gb18030Ranges[14][4]= …; /* bit flag for UConverter.options indicating GB 18030 special handling */ #define _MBCS_OPTION_GB18030 … /* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */ #define _MBCS_OPTION_KEIS … #define _MBCS_OPTION_JEF … #define _MBCS_OPTION_JIPS … #define KEIS_SO_CHAR_1 … #define KEIS_SO_CHAR_2 … #define KEIS_SI_CHAR_1 … #define KEIS_SI_CHAR_2 … #define JEF_SO_CHAR … #define JEF_SI_CHAR … #define JIPS_SO_CHAR_1 … #define JIPS_SO_CHAR_2 … #define JIPS_SI_CHAR_1 … #define JIPS_SI_CHAR_2 … enum SISO_Option { … }; SISO_Option; static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { … } /* Miscellaneous ------------------------------------------------------------ */ /* similar to ucnv_MBCSGetNextUChar() but recursive */ static UBool enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], int32_t state, uint32_t offset, uint32_t value, UConverterEnumToUCallback *callback, const void *context, UErrorCode *pErrorCode) { … } /* * Only called if stateProps[state]==-1. * A recursive call may do stateProps[state]|=0x40 if this state is the target of an * MBCS_STATE_CHANGE_ONLY. */ static int8_t getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { … } /* * Internal function enumerating the toUnicode data of an MBCS converter. * Currently only used for reconstituting data for an MBCS_OPT_NO_FROM_U * table, but could also be used for a future ucnv_getUnicodeSet() option * that includes reverse fallbacks (after updating this function's implementation). * Currently only handles roundtrip mappings. * Does not currently handle extensions. 
*/ static void ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, UConverterEnumToUCallback *callback, const void *context, UErrorCode *pErrorCode) { … } U_CFUNC void ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, const USetAdder *sa, UConverterUnicodeSet which, UConverterSetFilter filter, UErrorCode *pErrorCode) { … } U_CFUNC void ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, const USetAdder *sa, UConverterUnicodeSet which, UErrorCode *pErrorCode) { … } static void U_CALLCONV ucnv_MBCSGetUnicodeSet(const UConverter *cnv, const USetAdder *sa, UConverterUnicodeSet which, UErrorCode *pErrorCode) { … } /* conversion extensions for input not in the main table -------------------- */ /* * Hardcoded extension handling for GB 18030. * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. * * In the future, conversion extensions may handle m:n mappings and delta tables, * see https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/conversion/conversion_extensions.html * * If an input character cannot be mapped, then these functions set an error * code. The framework will then call the callback function. 
*/ /* * @return if(U_FAILURE) return the code point for cnv->fromUChar32 * else return 0 after output has been written to the target */ static UChar32 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, UChar32 cp, const char16_t **source, const char16_t *sourceLimit, uint8_t **target, const uint8_t *targetLimit, int32_t **offsets, int32_t sourceIndex, UBool flush, UErrorCode *pErrorCode) { … } /* * Input sequence: cnv->toUBytes[0..length[ * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input * else return 0 after output has been written to the target */ static int8_t _extToU(UConverter *cnv, const UConverterSharedData *sharedData, int8_t length, const uint8_t **source, const uint8_t *sourceLimit, char16_t **target, const char16_t *targetLimit, int32_t **offsets, int32_t sourceIndex, UBool flush, UErrorCode *pErrorCode) { … } /* EBCDIC swap LF<->NL ------------------------------------------------------ */ /* * This code modifies a standard EBCDIC<->Unicode mapping table for * OS/390 (z/OS) Unix System Services (Open Edition). * The difference is in the mapping of Line Feed and New Line control codes: * Standard EBCDIC maps * * <U000A> \x25 |0 * <U0085> \x15 |0 * * but OS/390 USS EBCDIC swaps the control codes for LF and NL, * mapping * * <U000A> \x15 |0 * <U0085> \x25 |0 * * This code modifies a loaded standard EBCDIC<->Unicode mapping table * by copying it into allocated memory and swapping the LF and NL values. * It allows to support the same EBCDIC charset in both versions without * duplicating the entire installed table. 
*/ /* standard EBCDIC codes */ #define EBCDIC_LF … #define EBCDIC_NL … /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ #define EBCDIC_RT_LF … #define EBCDIC_RT_NL … /* Unicode code points */ #define U_LF … #define U_NL … static UBool _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { … } /* reconstitute omitted fromUnicode data ------------------------------------ */ /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ static UBool U_CALLCONV writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { … } static void reconstituteData(UConverterMBCSTable *mbcsTable, uint32_t stage1Length, uint32_t stage2Length, uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ UErrorCode *pErrorCode) { … } /* MBCS setup functions ----------------------------------------------------- */ static void U_CALLCONV ucnv_MBCSLoad(UConverterSharedData *sharedData, UConverterLoadArgs *pArgs, const uint8_t *raw, UErrorCode *pErrorCode) { … } static void U_CALLCONV ucnv_MBCSUnload(UConverterSharedData *sharedData) { … } static void U_CALLCONV ucnv_MBCSOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode) { … } U_CDECL_BEGIN static const char* U_CALLCONV ucnv_MBCSGetName(const UConverter *cnv) { … } U_CDECL_END /* MBCS-to-Unicode conversion functions ------------------------------------- */ static UChar32 U_CALLCONV ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { … } /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ static void ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages * that only map to and from the BMP. 
* In addition to single-byte optimizations, the offset calculations * become much easier. */ static void ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } static UBool hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { … } /* * Is byte b a single/lead byte in this state? * Recurse for transition states, because here we don't want to say that * b is a lead byte if all byte sequences that start with b are illegal. */ static UBool isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { … } U_CFUNC void ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. * We still need a conversion loop in case we find reserved action codes, which are to be ignored. */ static UChar32 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* * Version of _MBCSToUnicodeWithOffsets() optimized for single-character * conversion without offset handling. * * When a character does not have a mapping to Unicode, then we return to the * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback * handling. * We also defer to the generic code in other complicated cases and have them * ultimately handled by _MBCSToUnicodeWithOffsets() itself. * * All normal mappings and errors are handled here. */ static UChar32 U_CALLCONV ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } #if 0 /* * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus * Removal improves code coverage. */ /** * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. * It does not handle the EBCDIC swaplfnl option (set in UConverter). * It does not handle conversion extensions (_extToU()). 
*/ U_CFUNC UChar32 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, uint8_t b, UBool useFallback) { int32_t entry; uint8_t action; /* only row 0 (the initial state) of the state table is consulted for the single input byte b */ entry=sharedData->mbcs.stateTable[0][b]; /* the entry is used as a final entry without testing MBCS_ENTRY_IS_FINAL(entry) */ if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { /* output BMP code point */ return (char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); } /* * An if-else-if chain provides more reliable performance for * the most common cases compared to a switch. */ action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); if(action==MBCS_STATE_VALID_DIRECT_20) { /* output supplementary code point: the entry stores the code point minus 0x10000 */ return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { if(!TO_U_USE_FALLBACK(useFallback)) { /* fallbacks are not used: same result as for an unassigned byte */ return 0xfffe; } /* output BMP code point */ return (char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { if(!TO_U_USE_FALLBACK(useFallback)) { /* fallbacks are not used: same result as for an unassigned byte */ return 0xfffe; } /* output supplementary code point */ return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); } else if(action==MBCS_STATE_UNASSIGNED) { return 0xfffe; } else if(action==MBCS_STATE_ILLEGAL) { return 0xffff; } else { /* reserved action codes, must never occur; same result as for an illegal byte */ return 0xffff; } } #endif /* * This is a simple version of _MBCSGetNextUChar() that is used * by other converter implementations. * It only returns an "assigned" result if it consumes the entire input. * It does not use state from the converter, nor error codes. * It does not handle the EBCDIC swaplfnl option (set in UConverter). * It handles conversion extensions but not GB 18030. * * Return value: * U+fffe unassigned * U+ffff illegal * otherwise the Unicode code point */ U_CFUNC UChar32 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, const char *source, int32_t length, UBool useFallback) { … } /* MBCS-from-Unicode conversion functions ----------------------------------- */ /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. 
*/ static void ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ static void ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages * that map only to and from the BMP. * In addition to single-byte/state optimizations, the offset calculations * become much easier. * It would be possible to use the sbcsIndex for UTF-8-friendly tables, * but measurements have shown that this diminishes performance * in more cases than it improves it. * See SVN revision 21013 (2007-feb-06) for the last version with #if switches * for various MBCS and SBCS optimizations. */ static void ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } U_CFUNC void ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { … } /* * This is another simple conversion function for internal use by other * conversion implementations. * It does not use the converter state nor call callbacks. * It does not handle the EBCDIC swaplfnl option (set in UConverter). * It handles conversion extensions but not GB 18030. * * It converts one single Unicode code point into codepage bytes, encoded * as one 32-bit value. The function returns the number of bytes in *pValue: * 1..4 the number of bytes in *pValue * 0 unassigned (*pValue undefined) * -1 illegal (currently not used, *pValue undefined) * * *pValue will contain the resulting bytes with the last byte in bits 7..0, * the second to last byte in bits 15..8, etc. * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 
*/ U_CFUNC int32_t ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, UChar32 c, uint32_t *pValue, UBool useFallback) { … } #if 0 /* * This function has been moved to ucnv2022.c for inlining. * This implementation is here only for documentation purposes. */ /** * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. * It does not handle the EBCDIC swaplfnl option (set in UConverter). * It does not handle conversion extensions (_extFromU()). * * It returns the codepage byte for the code point, or -1 if it is unassigned * (or if it only has a fallback mapping that is not accepted, see the test on value below). */ U_CFUNC int32_t ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, UChar32 c, UBool useFallback) { const uint16_t *table; int32_t value; /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { return -1; } /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ table=sharedData->mbcs.fromUnicodeTable; /* get the byte for the output */ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); /* * is this code point assigned, or do we use fallbacks? * The result-kind digit above the byte is f (roundtrip), c (fallback from a * private-use code point), 8 (other fallback) or 0 (unassigned), see the * "Single-byte lookup" description near the top of this file. * With useFallback every kind of mapping (>=0x800) is accepted; * otherwise only values >=0xc00 are accepted, i.e. not the "other fallback" kind. */ if(useFallback ? value>=0x800 : value>=0xc00) { /* bits 7..0 contain the codepage byte */ return value&0xff; } else { return -1; } } #endif /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/ static const UChar32 utf8_offsets[5]= …; static void U_CALLCONV ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, UConverterToUnicodeArgs *pToUArgs, UErrorCode *pErrorCode) { … } static void U_CALLCONV ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, UConverterToUnicodeArgs *pToUArgs, UErrorCode *pErrorCode) { … } /* miscellaneous ------------------------------------------------------------ */ static void U_CALLCONV ucnv_MBCSGetStarters(const UConverter* cnv, UBool starters[256], UErrorCode *) { … } /* * This is an internal function that allows other converter implementations * to check whether a byte is a lead byte. */ U_CFUNC UBool ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { … } static void U_CALLCONV ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode) { … } U_CFUNC UConverterType ucnv_MBCSGetType(const UConverter* converter) { … } #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */