// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2002-2016, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uprops.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2002feb24 * created by: Markus W. Scherer * * Constants for mostly non-core Unicode character properties * stored in uprops.icu. */ #ifndef __UPROPS_H__ #define __UPROPS_H__ #include "unicode/utypes.h" #include "unicode/uset.h" #include "uset_imp.h" #include "udataswp.h" /* indexes[] entries */ enum { … }; /* definitions for the main properties words */ enum { … }; #define GET_CATEGORY(props) … #define CAT_MASK(props) … #define GET_NUMERIC_TYPE_VALUE(props) … /* constants for the storage form of numeric types and values */ enum { … }; #define UPROPS_NTV_GET_TYPE(ntv) … /* number of properties vector words */ #define UPROPS_VECTOR_WORDS … /* * Properties in vector word 0 * Bits * 31..24 DerivedAge version major/minor one nibble each * 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index * 3: Script value from Script_Extensions * 2: Script=Inherited * 1: Script=Common * 0: Script=bits 21..20 & 7..0 * 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions * 19..17 East Asian Width * 16.. 8 UBlockCode * 7.. 0 UScriptCode, or index to Script_Extensions */ /* derived age: one nibble each for major and minor version numbers */ #define UPROPS_AGE_MASK … #define UPROPS_AGE_SHIFT … /* Script_Extensions: mask includes Script */ #define UPROPS_SCRIPT_X_MASK … #define UPROPS_SCRIPT_X_SHIFT … // The UScriptCode or Script_Extensions index is split across two bit fields. // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) 
// Shift the high bits right by 12 to assemble the full value. #define UPROPS_SCRIPT_HIGH_MASK … #define UPROPS_SCRIPT_HIGH_SHIFT … #define UPROPS_MAX_SCRIPT … #define UPROPS_EA_MASK … #define UPROPS_EA_SHIFT … #define UPROPS_BLOCK_MASK … #define UPROPS_BLOCK_SHIFT … #define UPROPS_SCRIPT_LOW_MASK … /* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ #define UPROPS_SCRIPT_X_WITH_COMMON … #define UPROPS_SCRIPT_X_WITH_INHERITED … #define UPROPS_SCRIPT_X_WITH_OTHER … #ifdef __cplusplus namespace { inline uint32_t uprops_mergeScriptCodeOrIndex(uint32_t scriptX) { … } } // namespace #endif // __cplusplus /* * Properties in vector word 1 * Each bit encodes one binary property. * The following constants represent the bit number, use 1<<UPROPS_XYZ. * UPROPS_BINARY_1_TOP<=32! * * Keep this list of property enums in sync with * propListNames[] in icu/source/tools/genprops/props2.c! * * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". */ enum { … }; /* * Properties in vector word 2 * Bits * 31..26 ICU 75: Identifier_Type bit set * ICU 70..74: unused * ICU 57..69: emoji properties; moved to uemoji.icu in ICU 70 * 25..20 Line Break * 19..15 Sentence Break * 14..10 Word Break * 9.. 5 Grapheme Cluster Break * 4.. 0 Decomposition Type */ #ifdef __cplusplus // https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type // The Identifier_Type maps each code point to a *set* of one or more values. // Some can be combined with others, some can only occur alone. // Exclusion & Limited_Use are combinable bits, but cannot occur together. // We use this forbidden combination for enumerated values. // We use 6 bits for all possible combinations. // If more combinable values are added, then we need to use more bits. // // We do not store separate data for Identifier_Status: // We can derive that from the encoded Identifier_Type via a simple range check. 
inline constexpr uint32_t UPROPS_2_ID_TYPE_MASK = …; inline constexpr int32_t UPROPS_2_ID_TYPE_SHIFT = …; enum { … }; /** * Maps UIdentifierType to encoded bits. * When UPROPS_ID_TYPE_BIT is set, then use "&" to test whether the value bit is set. * When UPROPS_ID_TYPE_BIT is not set, then compare ("==") the array value with the data value. */ inline constexpr uint8_t uprops_idTypeToEncoded[] = …; #endif // __cplusplus #define UPROPS_LB_MASK … #define UPROPS_LB_SHIFT … #define UPROPS_SB_MASK … #define UPROPS_SB_SHIFT … #define UPROPS_WB_MASK … #define UPROPS_WB_SHIFT … #define UPROPS_GCB_MASK … #define UPROPS_GCB_SHIFT … #define UPROPS_DT_MASK … /** * Gets the main properties value for a code point. * Implemented in uchar.c for uprops.cpp. */ U_CFUNC uint32_t u_getMainProperties(UChar32 c); /** * Get a properties vector word for a code point. * Implemented in uchar.c for uprops.cpp. * @return 0 if no data or illegal argument */ U_CFUNC uint32_t u_getUnicodeProperties(UChar32 c, int32_t column); /** * Get the maximum values for some enum/int properties. * Use the same column numbers as for u_getUnicodeProperties(). * The returned value will contain maximum values stored in the same bit fields * as where the enum values are stored in the u_getUnicodeProperties() * return values for the same columns. * * Valid columns are those for properties words that contain enumerated values. * (ICU 2.6: columns 0 and 2) * For other column numbers, this function will return 0. * * @internal */ U_CFUNC int32_t uprv_getMaxValues(int32_t column); /** * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. * @internal */ U_CFUNC UBool u_isalnumPOSIX(UChar32 c); /** * Checks if c is in * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] * with space=\p{Whitespace} and Control=Cc. * Implements UCHAR_POSIX_GRAPH. * @internal */ U_CFUNC UBool u_isgraphPOSIX(UChar32 c); /** * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 
* Implements UCHAR_POSIX_PRINT. * @internal */ U_CFUNC UBool u_isprintPOSIX(UChar32 c); /** Some code points. @internal */ enum { … }; /** * Get the maximum length of a (regular/1.0/extended) character name. * @return 0 if no character names available. */ U_CAPI int32_t U_EXPORT2 uprv_getMaxCharNameLength(void); /** * Fills set with characters that are used in Unicode character names. * Includes all characters that are used in regular/Unicode 1.0/extended names. * Just empties the set if no character names are available. * @param sa USetAdder to receive characters. */ U_CAPI void U_EXPORT2 uprv_getCharNameCharacters(const USetAdder *sa); /** * Constants for which data and implementation files provide which properties. * Used by UnicodeSet for service-specific property enumeration. * @internal */ enum UPropertySource { … }; UPropertySource; /** * @see UPropertySource * @internal */ U_CFUNC UPropertySource U_EXPORT2 uprops_getSource(UProperty which); /** * Enumerate uprops.icu's main data trie and add the * start of each range of same properties to the set. * @internal */ U_CFUNC void U_EXPORT2 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); /** * Enumerate uprops.icu's properties vectors trie and add the * start of each range of same properties to the set. * @internal */ U_CFUNC void U_EXPORT2 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); U_CFUNC void U_EXPORT2 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode); /** * Return a set of characters for property enumeration. * For each two consecutive characters (start, limit) in the set, * all of the properties for start..limit-1 are all the same. * * @param sa USetAdder to receive result. Existing contents are lost. * @internal */ /*U_CFUNC void U_EXPORT2 uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode); */ /** * Swap the ICU Unicode character names file. See uchar.c. 
* @internal */ U_CAPI int32_t U_EXPORT2 uchar_swapNames(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode); #ifdef __cplusplus U_NAMESPACE_BEGIN class UnicodeSet; class CharacterProperties { … }; // implemented in uniset_props.cpp U_CFUNC UnicodeSet * uniset_getUnicode32Instance(UErrorCode &errorCode); U_NAMESPACE_END #endif #endif