// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uprops.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2002feb24 * created by: Markus W. Scherer * * Constants for mostly non-core Unicode character properties * stored in uprops.icu. */ #ifndef __UPROPS_H__ #define __UPROPS_H__ #include "unicode/utypes.h" #include "unicode/uset.h" #include "uset_imp.h" #include "udataswp.h" /* indexes[] entries */ enum { … }; /* definitions for the main properties words */ enum { … }; #define GET_CATEGORY(props) … #define CAT_MASK(props) … #define GET_NUMERIC_TYPE_VALUE(props) … /* constants for the storage form of numeric types and values */ enum { … }; #define UPROPS_NTV_GET_TYPE(ntv) … /* number of properties vector words */ #define UPROPS_VECTOR_WORDS … #ifdef __cplusplus namespace { // Properties in vector word 0 // Bits // 31..26 Age major version (major=0..63) // 25..24 Age minor version (minor=0..3) // 23..17 reserved // 16..15 Indic Conjunct Break // 14..12 East Asian Width // 11..10 3..1: Bits 9..0 = Script_Extensions index // 3: Script value from Script_Extensions // 2: Script=Inherited // 1: Script=Common // 0: Script=bits 9..0 // 9.. 0 UScriptCode, or index to Script_Extensions // *Note*: If we need more than the available bits for new properties, // then we could move the Age property out of the properties vectors. // For example, we could store the Age property in its own trie. 
// In a small, 8-bit-value-width CodePointTrie, it would be larger than // the amount of data that we would save in the properties vectors and their trie, // but the size increase would be a small percentage of the total uprops.icu size. // It would certainly be a much smaller increase than widening the properties vectors. // The savings in the properties vectors+trie from pulling out the Age property // are partly from mediocre correlation between Age and other property values. // (Adding new characters to existing scripts tends to split property vectors where // new characters are similar to old ones.) // See https://github.com/unicode-org/icu/pull/3025 for details. inline constexpr uint32_t UPROPS_AGE_MASK = …; inline constexpr int32_t UPROPS_AGE_SHIFT = …; inline constexpr uint8_t UPROPS_AGE_MAJOR_MAX = …; inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = …; inline constexpr uint32_t UPROPS_EA_MASK = …; inline constexpr int32_t UPROPS_EA_SHIFT = …; inline constexpr uint32_t UPROPS_INCB_MASK = …; inline constexpr int32_t UPROPS_INCB_SHIFT = …; /** Script_Extensions: mask includes Script */ inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = …; // UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_OTHER = …; inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_INHERITED = …; inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_COMMON = …; inline constexpr int32_t UPROPS_MAX_SCRIPT = …; /* * Properties in vector word 1 * Each bit encodes one binary property. * The following constants represent the bit number, use 1<<UPROPS_XYZ. * UPROPS_BINARY_1_TOP<=32! * * Keep this list of property enums in sync with * propListNames[] in icu/source/tools/genprops/props2.c! * * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
*/ enum { … }; /* * Properties in vector word 2 * Bits * 31..26 ICU 75: Identifier_Type bit set * ICU 70..74: unused * ICU 57..69: emoji properties; moved to uemoji.icu in ICU 70 * 25..20 Line Break * 19..15 Sentence Break * 14..10 Word Break * 9.. 5 Grapheme Cluster Break * 4.. 0 Decomposition Type */ // https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type // The Identifier_Type maps each code point to a *set* of one or more values. // Some can be combined with others, some can only occur alone. // Exclusion & Limited_Use are combinable bits, but cannot occur together. // We use this forbidden combination for enumerated values. // We use 6 bits for all possible combinations. // If more combinable values are added, then we need to use more bits. // // We do not store separate data for Identifier_Status: // We can derive that from the encoded Identifier_Type via a simple range check. inline constexpr uint32_t UPROPS_2_ID_TYPE_MASK = …; inline constexpr int32_t UPROPS_2_ID_TYPE_SHIFT = …; enum { … }; /** * Maps UIdentifierType to encoded bits. * When UPROPS_ID_TYPE_BIT is set, then use "&" to test whether the value bit is set. * When UPROPS_ID_TYPE_BIT is not set, then compare ("==") the array value with the data value. */ inline constexpr uint8_t uprops_idTypeToEncoded[] = …; } // namespace #endif // __cplusplus #define UPROPS_LB_MASK … #define UPROPS_LB_SHIFT … #define UPROPS_SB_MASK … #define UPROPS_SB_SHIFT … #define UPROPS_WB_MASK … #define UPROPS_WB_SHIFT … #define UPROPS_GCB_MASK … #define UPROPS_GCB_SHIFT … #define UPROPS_DT_MASK … #ifdef __cplusplus namespace { // Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX inline constexpr uint32_t UPROPS_MAX_BLOCK = …; } // namespace #endif // __cplusplus /** * Gets the main properties value for a code point. * Implemented in uchar.c for uprops.cpp. */ U_CFUNC uint32_t u_getMainProperties(UChar32 c); /** * Get a properties vector word for a code point. * Implemented in uchar.c for uprops.cpp. 
* @return 0 if no data or illegal argument */ U_CFUNC uint32_t u_getUnicodeProperties(UChar32 c, int32_t column); /** * Get the maximum values for some enum/int properties. * Use the same column numbers as for u_getUnicodeProperties(). * The returned value will contain maximum values stored in the same bit fields * as where the enum values are stored in the u_getUnicodeProperties() * return values for the same columns. * * Valid columns are those for properties words that contain enumerated values. * (ICU 2.6: columns 0 and 2) * For other column numbers, this function will return 0. * * @internal */ U_CFUNC int32_t uprv_getMaxValues(int32_t column); /** * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. * @internal */ U_CFUNC UBool u_isalnumPOSIX(UChar32 c); /** * Checks if c is in * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] * with space=\p{Whitespace} and Control=Cc. * Implements UCHAR_POSIX_GRAPH. * @internal */ U_CFUNC UBool u_isgraphPOSIX(UChar32 c); /** * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. * Implements UCHAR_POSIX_PRINT. * @internal */ U_CFUNC UBool u_isprintPOSIX(UChar32 c); /** Some code points. @internal */ enum { … }; // TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h // need not be C-compatible any more. /** * Get the maximum length of a (regular/1.0/extended) character name. * @return 0 if no character names available. */ U_CAPI int32_t U_EXPORT2 uprv_getMaxCharNameLength(void); /** * Fills set with characters that are used in Unicode character names. * Includes all characters that are used in regular/Unicode 1.0/extended names. * Just empties the set if no character names are available. * @param sa USetAdder to receive characters. */ U_CAPI void U_EXPORT2 uprv_getCharNameCharacters(const USetAdder *sa); /** * Constants for which data and implementation files provide which properties. 
* Used by UnicodeSet for service-specific property enumeration. * @internal */ enum UPropertySource { … }; UPropertySource; /** * @see UPropertySource * @internal */ U_CFUNC UPropertySource U_EXPORT2 uprops_getSource(UProperty which); /** * Enumerate uprops.icu's main data trie and add the * start of each range of same properties to the set. * @internal */ U_CFUNC void U_EXPORT2 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); /** * Enumerate uprops.icu's properties vectors trie and add the * start of each range of same properties to the set. * @internal */ U_CFUNC void U_EXPORT2 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); /** * Add property range starts for the given property source to the set. * NOTE(review): summary inferred from the signature and the sibling declarations; * confirm against the implementation. * @param src the UPropertySource selecting which data to enumerate * @param sa USetAdder to receive the range starts * @param pErrorCode in/out ICU error code * @internal */ U_CFUNC void U_EXPORT2 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode); #ifdef __cplusplus /** * Declared for C++ only because the error code is passed by reference. * NOTE(review): presumably adds the range starts of the Block property data to the set, * by analogy with the other *_addPropertyStarts functions; confirm against the implementation. * @internal */ U_CFUNC void U_EXPORT2 ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode); #endif // __cplusplus /** * Return a set of characters for property enumeration. * For each two consecutive characters (start, limit) in the set, * all of the properties for start..limit-1 are the same. * * @param sa USetAdder to receive result. Existing contents are lost. * @internal */ /*U_CFUNC void U_EXPORT2 uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode); */ // TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h // need not be C-compatible any more. /** * Swap the ICU Unicode character names file. See uchar.c. * @internal */ U_CAPI int32_t U_EXPORT2 uchar_swapNames(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode); #ifdef __cplusplus U_NAMESPACE_BEGIN class UnicodeSet; class CharacterProperties { … }; // implemented in uniset_props.cpp U_CFUNC UnicodeSet * uniset_getUnicode32Instance(UErrorCode &errorCode); U_NAMESPACE_END #endif #endif