// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * * ucnv_io.cpp: * initializes global variables and defines functions pertaining to converter * name resolution aspect of the conversion code. * * new implementation: * * created on: 1999nov22 * created by: Markus W. Scherer * * Use the binary cnvalias.icu (created from convrtrs.txt) to work * with aliases for converter names. * * Date Name Description * 11/22/1999 markus Created * 06/28/2002 grhoten Major overhaul of the converter alias design. * Now an alias can map to different converters * depending on the specified standard. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/ucnv.h" #include "unicode/udata.h" #include "umutex.h" #include "uarrsort.h" #include "uassert.h" #include "udataswp.h" #include "cstring.h" #include "cmemory.h" #include "ucnv_io.h" #include "uenumimp.h" #include "ucln_cmn.h" /* Format of cnvalias.icu ----------------------------------------------------- * * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. * This binary form contains several tables. All indexes are to uint16_t * units, and not to the bytes (uint8_t units). Addressing everything on * 16-bit boundaries allows us to store more information with small index * numbers, which are also 16-bit in size. The majority of the tables (except * the string table) contain 16-bit numbers. * * First there is the size of the Table of Contents (TOC). The TOC * entries contain the size of each section. In order to find the offset * of a section you just need to sum up the sizes of the preceding sections. 
* The TOC length and entries are an array of uint32_t values. * The first section after the TOC starts immediately after the TOC. * * 1) This section contains a list of converters. This list contains indexes * into the string table for the converter name. The index of this list is * also used by other sections, which are mentioned later on. * This list is not sorted. * * 2) This section contains a list of tags. This list contains indexes * into the string table for the tag name. The index of this list is * also used by other sections, which are mentioned later on. * This list is in priority order of standards. * * 3) This section contains a list of sorted unique aliases. This * list contains indexes into the string table for the alias name. The * index of this list is also used by other sections, like the 4th section. * The index for the 3rd and 4th section is used to get the * alias -> converter name mapping. Sections 3 and 4 form a two-column table. * Some of the most significant bits of each index may contain other * information (see findConverter for details). * * 4) This section contains a list of mapped converter names. Consider this * as a table that maps the 3rd section to the 1st section. This list contains * indexes into the 1st section. The index of this list is the same index in * the 3rd section. There is also some extra information in the high bits of * each converter index in this table. Currently it's only used to say that * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is * the predigested form of the 5th section so that an alias lookup can be fast. * * 5) This section contains a 2D array with indexes to the 6th section. This * section is the full form of all alias mappings. The column index is the * index into the converter list (column header). The row index is the index * to the tag list (row header). This 2D array is the top part of a 3D array. 
The * third dimension is in the 6th section. * * 6) This is a blob of variable-length arrays. Each array starts with a size, * and is followed by indexes to alias names in the string table. This is * the third dimension of section 5. No other section should be referencing * this section. * * 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its * presence indicates that a section 9 exists. UConverterAliasOptions specifies * what type of string normalization is used among other potential things in the * future. * * 8) This is the string table. All strings are indexed on an even address. * There are two reasons for this. First, many chip architectures locate strings * faster on even address boundaries. Second, since all indexes are 16-bit * numbers, this string table can be 128KB in size instead of 64KB when we * only have strings starting on an even address. * * 9) When present, this is a set of prenormalized strings from section 8. This * table contains normalized strings with the dashes and spaces stripped out, * and all strings lowercased. In the future, the options in section 7 may state * other types of normalization. * * Here is the concept of sections 5 and 6. It's a 3D cube. Each tag * has a unique alias among all converters. That same alias can * be mentioned in other standards on different converters, * but only one alias per tag can be unique. * * * Converter Names (Usually in TR22 form) * -------------------------------------------. * T / /| * a / / | * g / / | * s / / | * / / | * ------------------------------------------/ | * A | | | * l | | | * i | | / * a | | / * s | | / * e | | / * s | |/ * ------------------------------------------- * * * * Here is what it really looks like. It's like swiss cheese. * There are holes. Some converters aren't recognized by * a standard, or they are really old converters that the * standard doesn't recognize anymore. 
* * Converter Names (Usually in TR22 form) * -------------------------------------------. * T /##########################################/| * a / # # /# * g / # ## ## ### # ### ### ### #/ * s / # ##### #### ## ## #/# * / ### # # ## # # # ### # # #/## * ------------------------------------------/# # * A |### # # ## # # # ### # # #|# # * l |# # # # # ## # #|# # * i |# # # # # # #|# * a |# #|# * s | #|# * e * s * */ /** * Used by the UEnumeration API */ UAliasContext; static const char DATA_NAME[] = …; static const char DATA_TYPE[] = …; static UDataMemory *gAliasData= …; static icu::UInitOnce gAliasDataInitOnce { … }; enum { … }; static const UConverterAliasOptions defaultTableOptions = …; static UConverterAlias gMainTable; #define GET_STRING(idx) … #define GET_NORMALIZED_STRING(idx) … static UBool U_CALLCONV isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/, const UDataInfo *pInfo) { … } static UBool U_CALLCONV ucnv_io_cleanup() { … } static void U_CALLCONV initAliasData(UErrorCode &errCode) { … } static UBool haveAliasData(UErrorCode *pErrorCode) { … } static inline UBool isAlias(const char *alias, UErrorCode *pErrorCode) { … } static uint32_t getTagNumber(const char *tagname) { … } /* character types relevant for ucnv_compareNames() */ enum { … }; /* character types for ASCII 00..7F */ static const uint8_t asciiTypes[128] = …; #define GET_ASCII_TYPE(c) … /* character types for EBCDIC 80..FF */ static const uint8_t ebcdicTypes[128] = …; #define GET_EBCDIC_TYPE(c) … #if U_CHARSET_FAMILY==U_ASCII_FAMILY #define GET_CHAR_TYPE(c) … #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY #define GET_CHAR_TYPE … #else # error U_CHARSET_FAMILY is not valid #endif /* @see ucnv_compareNames */ U_CAPI char * U_CALLCONV ucnv_io_stripASCIIForCompare(char *dst, const char *name) { … } U_CAPI char * U_CALLCONV ucnv_io_stripEBCDICForCompare(char *dst, const char *name) { … } /** * Do a fuzzy compare of two converter/alias names. 
* The comparison is case-insensitive, ignores leading zeroes that are * followed by further digits (so "08" matches "8"), and ignores all but letters and digits. * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 * at http://www.unicode.org/reports/tr22/ * * This is a symmetrical (commutative) operation; order of arguments * is insignificant. This is an important property for sorting the * list (when the list is preprocessed into binary form) and for * performing binary searches on it at run time. * * @param name1 a converter name or alias, zero-terminated * @param name2 a converter name or alias, zero-terminated * @return 0 if the names match, or a negative value if the name1 * lexically precedes name2, or a positive value if the name1 * lexically follows name2. * * @see ucnv_io_stripForCompare */ U_CAPI int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2) { … } /* * search for an alias * return the converter number index for gConverterList */ static inline uint32_t findConverter(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { … } /* * Is this alias in this list? * alias and listOffset should be non-nullptr. */ static inline UBool isAliasInList(const char *alias, uint32_t listOffset) { … } /* * Search for a standard name of an alias (what is the default name * that this standard uses?) * return the listOffset for gTaggedAliasLists. If it's 0, * then it couldn't be found, but the parameters are valid. 
*/ static uint32_t findTaggedAliasListsOffset(const char *alias, const char *standard, UErrorCode *pErrorCode) { … } /* Look up the canonical converter for an alias under a standard; the result is a uint32_t converter number rather than the name string (NOTE(review): body not shown here, confirm the not-found value) */ static uint32_t findTaggedConverterNum(const char *alias, const char *standard, UErrorCode *pErrorCode) { … } U_CAPI const char * ucnv_io_getConverterName(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) { … } U_CDECL_BEGIN static int32_t U_CALLCONV ucnv_io_countStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { … } static const char * U_CALLCONV ucnv_io_nextStandardAliases(UEnumeration *enumerator, int32_t* resultLength, UErrorCode * /*pErrorCode*/) { … } static void U_CALLCONV ucnv_io_resetStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { … } static void U_CALLCONV ucnv_io_closeUEnumeration(UEnumeration *enumerator) { … } U_CDECL_END /* Enumerate the aliases for the specified converter and standard tag */ static const UEnumeration gEnumAliases = …; U_CAPI UEnumeration * U_EXPORT2 ucnv_openStandardNames(const char *convName, const char *standard, UErrorCode *pErrorCode) { … } static uint16_t ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) { … } static uint16_t ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) { … } static const char * ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { … } static uint16_t ucnv_io_countStandards(UErrorCode *pErrorCode) { … } U_CAPI const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { … } U_CAPI const char * U_EXPORT2 ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { … } U_CAPI uint16_t U_EXPORT2 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) { … } U_CAPI const char* U_EXPORT2 ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { … } U_CAPI void U_EXPORT2 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) { … } U_CAPI 
uint16_t U_EXPORT2 ucnv_countStandards() { … } U_CAPI const char * U_EXPORT2 ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode) { … } U_CDECL_BEGIN static int32_t U_CALLCONV ucnv_io_countAllConverters(UEnumeration * /*enumerator*/, UErrorCode * /*pErrorCode*/) { … } static const char * U_CALLCONV ucnv_io_nextAllConverters(UEnumeration *enumerator, int32_t* resultLength, UErrorCode * /*pErrorCode*/) { … } static void U_CALLCONV ucnv_io_resetAllConverters(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) { … } U_CDECL_END static const UEnumeration gEnumAllConverters = …; U_CAPI UEnumeration * U_EXPORT2 ucnv_openAllNames(UErrorCode *pErrorCode) { … } U_CAPI uint16_t ucnv_io_countKnownConverters(UErrorCode *pErrorCode) { … } /* alias table swapping ----------------------------------------------------- */ U_CDECL_BEGIN StripForCompareFn; U_CDECL_END /* * row of a temporary array * * gets platform-endian charset string indexes and sorting indexes; * after sorting this array by strings, the actual arrays are permuted * according to the sorting indexes */ TempRow; TempAliasTable; enum { … }; static int32_t U_CALLCONV io_compareRows(const void *context, const void *left, const void *right) { … } U_CAPI int32_t U_EXPORT2 ucnv_swapAliases(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode) { … } #endif /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */