// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: unames.c * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 1999oct04 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #include "unicode/putil.h" #include "unicode/uchar.h" #include "unicode/udata.h" #include "unicode/utf.h" #include "unicode/utf16.h" #include "uassert.h" #include "ustr_imp.h" #include "umutex.h" #include "cmemory.h" #include "cstring.h" #include "ucln_cmn.h" #include "udataswp.h" #include "uprops.h" U_NAMESPACE_BEGIN /* prototypes ------------------------------------------------------------- */ static const char DATA_NAME[] = …; static const char DATA_TYPE[] = …; #define GROUP_SHIFT … #define LINES_PER_GROUP … #define GROUP_MASK … /* * This struct was replaced by explicitly accessing equivalent * fields from triples of uint16_t. * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, * which broke the assumption that sizeof(Group)==6 and that the ++ operator * would advance by 6 bytes (3 uint16_t). * * We can't just change the data structure because it's loaded from a data file, * and we don't want to make it less compact, so we changed the access code. * * For details see ICU tickets 6331 and 6008. typedef struct { uint16_t groupMSB, offsetHigh, offsetLow; / * avoid padding * / } Group; */ enum { … }; /* * Get the 32-bit group offset. 
* @param group (const uint16_t *) pointer to a Group triple of uint16_t * @return group offset (int32_t) */ #define GET_GROUP_OFFSET(group) … #define NEXT_GROUP(group) … #define PREV_GROUP(group) … AlgorithmicRange; UCharNames; /* * Get the groups table from a UCharNames struct. * The groups table consists of one uint16_t groupCount followed by * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH * and the comment for the old struct Group above. * * @param names (const UCharNames *) pointer to the UCharNames indexes * @return (const uint16_t *) pointer to the groups table */ #define GET_GROUPS(names) … FindName; #define DO_FIND_NAME … static UDataMemory *uCharNamesData= …; static UCharNames *uCharNames= …; static icu::UInitOnce gCharNamesInitOnce { … }; /* * Maximum length of character names (regular & 1.0). */ static int32_t gMaxNameLength= …; /* * Set of chars used in character names (regular & 1.0). * Chars are platform-dependent (can be EBCDIC). */ static uint32_t gNameSet[8]= …; #define U_NONCHARACTER_CODE_POINT … #define U_LEAD_SURROGATE … #define U_TRAIL_SURROGATE … #define U_CHAR_EXTENDED_CATEGORY_COUNT … static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = …; /* implementation ----------------------------------------------------------- */ static UBool U_CALLCONV unames_cleanup() { … } static UBool U_CALLCONV isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/, const UDataInfo *pInfo) { … } static void U_CALLCONV loadCharNames(UErrorCode &status) { … } static UBool isDataLoaded(UErrorCode *pErrorCode) { … } #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) … #define U_ISO_COMMENT … /* * Important: expandName() and compareName() are almost the same - * apply fixes to both. * * UnicodeData.txt uses ';' as a field separator, so no * field can contain ';' as part of its contents. 
* In unames.dat, it is marked as token[';']==-1 only if the * semicolon is used in the data file - which is iff we * have Unicode 1.0 names or ISO comments or aliases. * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases * although we know that it will never be part of a name. */ static uint16_t expandName(UCharNames *names, const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { … } /* * compareName() is almost the same as expandName() except that it compares * the currently expanded name to an input name. * It returns the match/no match result as soon as possible. */ static UBool compareName(UCharNames *names, const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, const char *otherName) { … } static uint8_t getCharCat(UChar32 cp) { … } static const char *getCharCatName(UChar32 cp) { … } static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { … } /* * getGroup() does a binary search for the group that contains the * Unicode code point "code". * The return value is always a valid Group* that may contain "code" * or else is the highest group before "code". * If the lowest group is after "code", then that one is returned. */ static const uint16_t * getGroup(UCharNames *names, uint32_t code) { … } /* * expandGroupLengths() reads a block of compressed lengths of 32 strings and * expands them into offsets and lengths for each string. * Lengths are stored with a variable-width encoding in consecutive nibbles: * If a nibble<0xc, then it is the length itself (0=empty string). * If a nibble>=0xc, then it forms a length value with the following nibble. * Calculation see below. * The offsets and lengths arrays must be at least 33 (one more) long because * there is no check here at the end if the last nibble is still used. 
*/ static const uint8_t * expandGroupLengths(const uint8_t *s, uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { … } static uint16_t expandGroupName(UCharNames *names, const uint16_t *group, uint16_t lineNumber, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { … } static uint16_t getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { … } /* * enumGroupNames() enumerates all the names in a 32-group * and either calls the enumerator function or finds a given input name. */ static UBool enumGroupNames(UCharNames *names, const uint16_t *group, UChar32 start, UChar32 end, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice) { … } /* * enumExtNames() enumerates extended names. * It only needs to do so when it is called with a real function and not * with the dummy DO_FIND_NAME, because u_charFromName() does a check * for extended names by itself. */ static UBool enumExtNames(UChar32 start, UChar32 end, UEnumCharNamesFn *fn, void *context) { … } static UBool enumNames(UCharNames *names, UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice) { … } static uint16_t writeFactorSuffix(const uint16_t *factors, uint16_t count, const char *s, /* suffix elements */ uint32_t code, uint16_t indexes[8], /* output fields from here */ const char *elementBases[8], const char *elements[8], char *buffer, uint16_t bufferLength) { … } /* * Important: * Parts of findAlgName() are almost the same as some of getAlgName(). * Fixes must be applied to both. */ static uint16_t getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { … } /* * Important: enumAlgNames() and findAlgName() are almost the same. * Any fix must be applied to both. 
*/ static UBool enumAlgNames(AlgorithmicRange *range, UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice) { … } /* * findAlgName() is almost the same as enumAlgNames() except that it * returns the code point for a name if it fits into the range. * It returns 0xffff otherwise. */ static UChar32 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { … } /* sets of name characters, maximum name lengths ---------------------------- */ #define SET_ADD(set, c) … #define SET_CONTAINS(set, c) … static int32_t calcStringSetLength(uint32_t set[8], const char *s) { … } static int32_t calcAlgNameSetsLengths(int32_t maxNameLength) { … } static int32_t calcExtNameSetsLengths(int32_t maxNameLength) { … } static int32_t calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, uint32_t set[8], const uint8_t **pLine, const uint8_t *lineLimit) { … } static void calcGroupNameSetsLengths(int32_t maxNameLength) { … } static UBool calcNameSetsLengths(UErrorCode *pErrorCode) { … } U_NAMESPACE_END /* public API --------------------------------------------------------------- */ U_NAMESPACE_USE U_CAPI int32_t U_EXPORT2 u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode) { … } U_CAPI int32_t U_EXPORT2 u_getISOComment(UChar32 /*c*/, char *dest, int32_t destCapacity, UErrorCode *pErrorCode) { … } U_CAPI UChar32 U_EXPORT2 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode) { … } U_CAPI void U_EXPORT2 u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode) { … } U_CAPI int32_t U_EXPORT2 uprv_getMaxCharNameLength() { … } /** * Converts the char set cset into a Unicode set. * @param cset Set of 256 bit flags corresponding to a set of chars. * @param sa USetAdder for the USet that receives the characters. 
Existing contents are deleted. */ static void charSetToUSet(uint32_t cset[8], const USetAdder *sa) { … } /** * Fills a set with the characters that are used in Unicode character names. * @param sa USetAdder for the USet that receives the characters. */ U_CAPI void U_EXPORT2 uprv_getCharNameCharacters(const USetAdder *sa) { … } /* data swapping ------------------------------------------------------------ */ /* * The token table contains non-negative entries for token bytes, * and -1 for bytes that represent themselves in the data file's charset. * -2 entries are used for lead bytes. * * Direct bytes (-1 entries) must be translated from the input charset family * to the output charset family. * makeTokenMap() writes a permutation mapping for this. * Use it once for single-/lead-byte tokens and once more for all trail byte * tokens. (';' is an unused trail byte marked with -1.) */ static void makeTokenMap(const UDataSwapper *ds, int16_t tokens[], uint16_t tokenCount, uint8_t map[256], UErrorCode *pErrorCode) { … } U_CAPI int32_t U_EXPORT2 uchar_swapNames(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode) { … } /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */