// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * * File UCHAR.H * * Modification History: * * Date Name Description * 04/02/97 aliu Creation. * 03/29/99 helena Updated for C APIs. * 4/15/99 Madhu Updated for C Implementation and Javadoc * 5/20/99 Madhu Added the function u_getVersion() * 8/19/1999 srl Upgraded scripts to Unicode 3.0 * 8/27/1999 schererm UCharDirection constants: U_... * 11/11/1999 weiv added u_isalnum(), cleaned comments * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion(). ****************************************************************************** */ #ifndef UCHAR_H #define UCHAR_H #include <stdbool.h> #include "unicode/utypes.h" #include "unicode/stringoptions.h" #include "unicode/ucpmap.h" #if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN) #define USET_DEFINED /** * USet is the C API type corresponding to C++ class UnicodeSet. * It is forward-declared here to avoid including unicode/uset.h file if related * APIs are not used. * * @see ucnv_getUnicodeSet * @stable ICU 2.4 */ USet; #endif U_CDECL_BEGIN /*==========================================================================*/ /* Unicode version number */ /*==========================================================================*/ /** * Unicode version number, default for the current ICU version. * The actual Unicode Character Database (UCD) data is stored in uprops.dat * and may be generated from UCD files from a different Unicode version. * Call u_getUnicodeVersion to get the actual Unicode version of the data. 
* * @see u_getUnicodeVersion * @stable ICU 2.0 */ #define U_UNICODE_VERSION … /** * \file * \brief C API: Unicode Properties * * This C API provides low-level access to the Unicode Character Database. * In addition to raw property values, some convenience functions calculate * derived properties, for example for Java-style programming. * * Unicode assigns each code point (not just assigned character) values for * many properties. * Most of them are simple boolean flags, or constants from a small enumerated list. * For some properties, values are strings or other relatively more complex types. * * For more information see * "About the Unicode Character Database" (http://www.unicode.org/ucd/) * and the ICU User Guide chapter on Properties (https://unicode-org.github.io/icu/userguide/strings/properties). * * Many properties are accessible via generic functions that take a UProperty selector. * - u_hasBinaryProperty() returns a binary value (true/false) per property and code point. * - u_getIntPropertyValue() returns an integer value per property and code point. * For each supported enumerated or catalog property, there is * an enum type for all of the property's values, and * u_getIntPropertyValue() returns the numeric values of those constants. * - u_getBinaryPropertySet() returns a set for each ICU-supported binary property with * all code points for which the property is true. * - u_getIntPropertyMap() returns a map for each * ICU-supported enumerated/catalog/int-valued property which * maps all Unicode code points to their values for that property. * * Many functions are designed to match java.lang.Character functions. * See the individual function documentation, * and see the JDK 1.4 java.lang.Character documentation * at http://java.sun.com/j2se/1.4/docs/api/java/lang/Character.html * * There are also functions that provide easy migration from C/POSIX functions * like isblank(). 
Their use is generally discouraged because the C/POSIX * standards do not define their semantics beyond the ASCII range, which means * that different implementations exhibit very different behavior. * Instead, Unicode properties should be used directly. * * There are also only a few, broad C/POSIX character classes, and they tend * to be used for conflicting purposes. For example, the "isalpha()" class * is sometimes used to determine word boundaries, while a more sophisticated * approach would at least distinguish initial letters from continuation * characters (the latter including combining marks). * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) * Another example: There is no "istitle()" class for titlecase characters. * * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. * ICU implements them according to the Standard Recommendations in * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). * * API access for C/POSIX character classes is as follows: * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC) * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE) * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE) * - punct: u_ispunct(c) * - digit: u_isdigit(c) or u_charType(c)==U_DECIMAL_DIGIT_NUMBER * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT) * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM) * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE) * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK) * - cntrl: u_charType(c)==U_CONTROL_CHAR * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH) * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT) * * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, * the Standard Recommendations in UTS #18. 
Instead, they match Java * functions according to their API documentation. * * \htmlonly * The C/POSIX character classes are also available in UnicodeSet patterns, * using patterns like [:graph:] or \p{graph}. * \endhtmlonly * * Note: There are several ICU whitespace functions. * Comparison: * - u_isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; * most of general categories "Z" (separators) + most whitespace ISO controls * (including no-break spaces, but excluding IS1..IS4) * - u_isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces * - u_isJavaSpaceChar: Java isSpaceChar; just Z (including no-break spaces) * - u_isspace: Z + whitespace ISO controls (including no-break spaces) * - u_isblank: "horizontal spaces" = TAB + Zs */ /** * Constants. */ /** The lowest Unicode code point value. Code points are non-negative. @stable ICU 2.0 */ #define UCHAR_MIN_VALUE … /** * The highest Unicode code point value (scalar value) according to * The Unicode Standard. This is a 21-bit value (20.1 bits, rounded up). * For a single character, UChar32 is a simple type that can hold any code point value. * * @see UChar32 * @stable ICU 2.0 */ #define UCHAR_MAX_VALUE … /** * Get a single-bit bit set (a flag) from a bit number 0..31. * @stable ICU 2.1 */ #define U_MASK(x) … /** * Selection constants for Unicode properties. * These constants are used in functions like u_hasBinaryProperty to select * one of the Unicode properties. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * * For details about the properties see * UAX #44: Unicode Character Database (http://www.unicode.org/reports/tr44/). * * Important: If ICU is built with UCD files from Unicode versions below, e.g., 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * Check u_getUnicodeVersion to be sure. 
* * @see u_hasBinaryProperty * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 2.1 */ UProperty; /** * Data for enumerated Unicode general category types. * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html . * @stable ICU 2.0 */ UCharCategory; /** * U_GC_XX_MASK constants are bit flags corresponding to Unicode * general category values. * For each category, the nth bit is set if the numeric value of the * corresponding UCharCategory constant is n. * * There are also some U_GC_Y_MASK constants for groups of general categories * like L for all letter categories. * * @see u_charType * @see U_GET_GC_MASK * @see UCharCategory * @stable ICU 2.1 */ #define U_GC_CN_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LU_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LL_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LT_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LM_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_LO_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MN_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ME_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_MC_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ND_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NL_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_NO_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZS_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZL_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_ZP_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CC_MASK … /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ #define U_GC_CF_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CO_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_CS_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PD_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PS_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PE_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PC_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PO_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SM_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SC_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SK_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_SO_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PI_MASK … /** Mask constant for a UCharCategory. @stable ICU 2.1 */ #define U_GC_PF_MASK … /** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ #define U_GC_L_MASK … /** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ #define U_GC_LC_MASK … /** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ #define U_GC_M_MASK … /** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */ #define U_GC_N_MASK … /** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */ #define U_GC_Z_MASK … /** Mask constant for multiple UCharCategory bits (C Others). @stable ICU 2.1 */ #define U_GC_C_MASK … /** Mask constant for multiple UCharCategory bits (P Punctuation). @stable ICU 2.1 */ #define U_GC_P_MASK … /** Mask constant for multiple UCharCategory bits (S Symbols). 
@stable ICU 2.1 */ #define U_GC_S_MASK … /** * This specifies the language directional property of a character set. * @stable ICU 2.0 */ UCharDirection; /** * Bidi Paired Bracket Type constants. * * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE * @stable ICU 52 */ UBidiPairedBracketType; /** * Constants for Unicode blocks, see the Unicode Data file Blocks.txt * @stable ICU 2.0 */ enum UBlockCode { … }; /** @stable ICU 2.0 */ UBlockCode; /** * East Asian Width constants. * * @see UCHAR_EAST_ASIAN_WIDTH * @see u_getIntPropertyValue * @stable ICU 2.2 */ UEastAsianWidth; /** * Selector constants for u_charName(). * u_charName() returns the "modern" name of a * Unicode character; or the name that was defined in * Unicode version 1.0, before the Unicode standard merged * with ISO-10646; or an "extended" name that gives each * Unicode code point a unique name. * * @see u_charName * @stable ICU 2.0 */ UCharNameChoice; /** * Selector constants for u_getPropertyName() and * u_getPropertyValueName(). These selectors are used to choose which * name is returned for a given property or value. All properties and * values have a long name. Most have a short name, but some do not. * Unicode allows for additional names, beyond the long and short * name, which would be indicated by U_LONG_PROPERTY_NAME + i, where * i=1, 2,... * * @see u_getPropertyName() * @see u_getPropertyValueName() * @stable ICU 2.4 */ UPropertyNameChoice; /** * Decomposition Type constants. * * @see UCHAR_DECOMPOSITION_TYPE * @stable ICU 2.2 */ UDecompositionType; /** * Joining Type constants. * * @see UCHAR_JOINING_TYPE * @stable ICU 2.2 */ UJoiningType; /** * Joining Group constants. * * @see UCHAR_JOINING_GROUP * @stable ICU 2.2 */ UJoiningGroup; /** * Grapheme Cluster Break constants. * * @see UCHAR_GRAPHEME_CLUSTER_BREAK * @stable ICU 3.4 */ UGraphemeClusterBreak; /** * Word Break constants. * (UWordBreak is a pre-existing enum type in ubrk.h for word break status tags.) 
* * @see UCHAR_WORD_BREAK * @stable ICU 3.4 */ UWordBreakValues; /** * Sentence Break constants. * * @see UCHAR_SENTENCE_BREAK * @stable ICU 3.4 */ USentenceBreak; /** * Line Break constants. * * @see UCHAR_LINE_BREAK * @stable ICU 2.2 */ ULineBreak; /** * Numeric Type constants. * * @see UCHAR_NUMERIC_TYPE * @stable ICU 2.2 */ UNumericType; /** * Hangul Syllable Type constants. * * @see UCHAR_HANGUL_SYLLABLE_TYPE * @stable ICU 2.6 */ UHangulSyllableType; /** * Indic Positional Category constants. * * @see UCHAR_INDIC_POSITIONAL_CATEGORY * @stable ICU 63 */ UIndicPositionalCategory; /** * Indic Syllabic Category constants. * * @see UCHAR_INDIC_SYLLABIC_CATEGORY * @stable ICU 63 */ UIndicSyllabicCategory; /** * Vertical Orientation constants. * * @see UCHAR_VERTICAL_ORIENTATION * @stable ICU 63 */ UVerticalOrientation; #ifndef U_HIDE_DRAFT_API /** * Identifier Status constants. * See https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type. * * @see UCHAR_IDENTIFIER_STATUS * @draft ICU 75 */ UIdentifierStatus; /** * Identifier Type constants. * See https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type. * * @see UCHAR_IDENTIFIER_TYPE * @draft ICU 75 */ UIdentifierType; #endif // U_HIDE_DRAFT_API /** * Check a binary Unicode property for a code point. * * Unicode, especially in version 3.2, defines many more properties than the * original set in UnicodeData.txt. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ucd/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Important: If ICU is built with UCD files from Unicode versions below 3.2, * then properties marked with "new in Unicode 3.2" are not or not fully available. * * @param c Code point to test. * @param which UProperty selector constant, identifies which binary property to check. 
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT. * @return true or false according to the binary Unicode property value for c. * Also false if 'which' is out of bounds or if the Unicode version * does not have data for the property at all. * * @see UProperty * @see u_getBinaryPropertySet * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_hasBinaryProperty(UChar32 c, UProperty which); /** * Returns true if the property is true for the string. * Same as u_hasBinaryProperty(single code point, which) * if the string contains exactly one code point. * * Most properties apply only to single code points. * <a href="https://www.unicode.org/reports/tr51/#Emoji_Sets">UTS #51 Unicode Emoji</a> * defines several properties of strings. * * @param s String to test. * @param length Length of the string, or negative if NUL-terminated. * @param which UProperty selector constant, identifies which binary property to check. * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT. * @return true or false according to the binary Unicode property value for the string. * Also false if 'which' is out of bounds or if the Unicode version * does not have data for the property at all. * * @see UProperty * @see u_hasBinaryProperty * @see u_getBinaryPropertySet * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 70 */ U_CAPI UBool U_EXPORT2 u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which); /** * Returns a frozen USet for a binary property. * The library retains ownership over the returned object. * Sets an error code if the property number is not one for a binary property. * * The returned set contains all code points for which the property is true.
* * @param property UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1 * @param pErrorCode an in/out ICU UErrorCode * @return the property as a set * @see UProperty * @see u_hasBinaryProperty * @see UnicodeSet::fromUSet * @stable ICU 63 */ U_CAPI const USet * U_EXPORT2 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode); /** * Check if a code point has the Alphabetic Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC). * This is different from u_isalpha! * @param c Code point to test * @return true if the code point has the Alphabetic Unicode property, false otherwise * * @see UCHAR_ALPHABETIC * @see u_isalpha * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUAlphabetic(UChar32 c); /** * Check if a code point has the Lowercase Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_LOWERCASE). * This is different from u_islower! * @param c Code point to test * @return true if the code point has the Lowercase Unicode property, false otherwise * * @see UCHAR_LOWERCASE * @see u_islower * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isULowercase(UChar32 c); /** * Check if a code point has the Uppercase Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_UPPERCASE). * This is different from u_isupper! * @param c Code point to test * @return true if the code point has the Uppercase Unicode property, false otherwise * * @see UCHAR_UPPERCASE * @see u_isupper * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUUppercase(UChar32 c); /** * Check if a code point has the White_Space Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_WHITE_SPACE). * This is different from both u_isspace and u_isWhitespace! * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c Code point to test * @return true if the code point has the White_Space Unicode property, false otherwise.
* * @see UCHAR_WHITE_SPACE * @see u_isWhitespace * @see u_isspace * @see u_isJavaSpaceChar * @see u_hasBinaryProperty * @stable ICU 2.1 */ U_CAPI UBool U_EXPORT2 u_isUWhiteSpace(UChar32 c); /** * Get the property value for an enumerated or integer Unicode property for a code point. * Also returns binary and mask property values. * * Unicode, especially in version 3.2, defines many more properties than the * original set in UnicodeData.txt. * * The properties APIs are intended to reflect Unicode properties as defined * in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). * For details about the properties see http://www.unicode.org/ . * For names of Unicode properties see the UCD file PropertyAliases.txt. * * Sample usage: * UEastAsianWidth ea=(UEastAsianWidth)u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH); * UBool b=(UBool)u_getIntPropertyValue(c, UCHAR_IDEOGRAPHIC); * * @param c Code point to test. * @param which UProperty selector constant, identifies which property to check. * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT * or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT. * @return Numeric value that is directly the property value or, * for enumerated properties, corresponds to the numeric value of the enumerated * constant of the respective property value enumeration type * (cast to enum type if necessary). * Returns 0 or 1 (for false/true) for binary Unicode properties. * Returns a bit-mask for mask properties. * Returns 0 if 'which' is out of bounds or if the Unicode version * does not have data for the property at all, or not for this code point. * * @see UProperty * @see u_hasBinaryProperty * @see u_getIntPropertyMinValue * @see u_getIntPropertyMaxValue * @see u_getIntPropertyMap * @see u_getUnicodeVersion * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_getIntPropertyValue(UChar32 c, UProperty which); /** * Get the minimum value for an enumerated/integer/binary Unicode property.
* Can be used together with u_getIntPropertyMaxValue * to allocate arrays of UnicodeSet or similar. * * @param which UProperty selector constant, identifies which property to check. * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT. * @return Minimum value returned by u_getIntPropertyValue for a Unicode property. * 0 if the property selector is out of range. * * @see UProperty * @see u_hasBinaryProperty * @see u_getUnicodeVersion * @see u_getIntPropertyMaxValue * @see u_getIntPropertyValue * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_getIntPropertyMinValue(UProperty which); /** * Get the maximum value for an enumerated/integer/binary Unicode property. * Can be used together with u_getIntPropertyMinValue * to allocate arrays of UnicodeSet or similar. * * Examples for min/max values (for Unicode 3.2): * * - UCHAR_BIDI_CLASS: 0/18 (U_LEFT_TO_RIGHT/U_BOUNDARY_NEUTRAL) * - UCHAR_SCRIPT: 0/45 (USCRIPT_COMMON/USCRIPT_TAGBANWA) * - UCHAR_IDEOGRAPHIC: 0/1 (false/true) * * For undefined UProperty constant values, min/max values will be 0/-1. * * @param which UProperty selector constant, identifies which property to check. * Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT * or UCHAR_INT_START<=which<UCHAR_INT_LIMIT. * @return Maximum value returned by u_getIntPropertyValue for a Unicode property. * <=0 if the property selector is out of range. * * @see UProperty * @see u_hasBinaryProperty * @see u_getUnicodeVersion * @see u_getIntPropertyMinValue * @see u_getIntPropertyValue * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_getIntPropertyMaxValue(UProperty which); /** * Returns an immutable UCPMap for an enumerated/catalog/int-valued property. * The library retains ownership over the returned object. * Sets an error code if the property number is not one for an "int property". * * The returned object maps all Unicode code points to their values for that property.
* For documentation of the integer values see u_getIntPropertyValue(). * * @param property UCHAR_INT_START..UCHAR_INT_LIMIT-1 * @param pErrorCode an in/out ICU UErrorCode * @return the property as a map * @see UProperty * @see u_getIntPropertyValue * @stable ICU 63 */ U_CAPI const UCPMap * U_EXPORT2 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode); /** * Get the numeric value for a Unicode code point as defined in the * Unicode Character Database. * * A "double" return type is necessary because * some numeric values are fractions, negative, or too large for int32_t. * * For characters without any numeric values in the Unicode Character Database, * this function will return U_NO_NUMERIC_VALUE. * Note: This is different from the Unicode Standard which specifies NaN as the default value. * (NaN is not available on all platforms.) * * Similar to java.lang.Character.getNumericValue(), but u_getNumericValue() * also supports negative values, large values, and fractions, * while Java's getNumericValue() returns values 10..35 for ASCII letters. * * @param c Code point to get the numeric value for. * @return Numeric value of c, or U_NO_NUMERIC_VALUE if none is defined. * * @see U_NO_NUMERIC_VALUE * @stable ICU 2.2 */ U_CAPI double U_EXPORT2 u_getNumericValue(UChar32 c); /** * Special value that is returned by u_getNumericValue when * no numeric value is defined for a code point. * * @see u_getNumericValue * @stable ICU 2.2 */ #define U_NO_NUMERIC_VALUE … /** * Determines whether the specified code point has the general category "Ll" * (lowercase letter). * * Same as java.lang.Character.isLowerCase(). * * This misses some characters that are also lowercase but * have a different general category value. * In order to include those, use UCHAR_LOWERCASE. * * In addition to being equivalent to a Java function, this also serves * as a C/POSIX migration function. 
* See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is an Ll lowercase letter * * @see UCHAR_LOWERCASE * @see u_isupper * @see u_istitle * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_islower(UChar32 c); /** * Determines whether the specified code point has the general category "Lu" * (uppercase letter). * * Same as java.lang.Character.isUpperCase(). * * This misses some characters that are also uppercase but * have a different general category value. * In order to include those, use UCHAR_UPPERCASE. * * In addition to being equivalent to a Java function, this also serves * as a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is an Lu uppercase letter * * @see UCHAR_UPPERCASE * @see u_islower * @see u_istitle * @see u_tolower * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isupper(UChar32 c); /** * Determines whether the specified code point is a titlecase letter. * True for general category "Lt" (titlecase letter). * * Same as java.lang.Character.isTitleCase(). * * @param c the code point to be tested * @return true if the code point is an Lt titlecase letter * * @see u_isupper * @see u_islower * @see u_totitle * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_istitle(UChar32 c); /** * Determines whether the specified code point is a digit character according to Java. * True for characters with general category "Nd" (decimal digit numbers). * Beginning with Unicode 4, this is the same as * testing for the Numeric_Type of Decimal. * * Same as java.lang.Character.isDigit(). * * In addition to being equivalent to a Java function, this also serves * as a C/POSIX migration function. 
* See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a digit character according to Character.isDigit() * * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isdigit(UChar32 c); /** * Determines whether the specified code point is a letter character. * True for general categories "L" (letters). * * Same as java.lang.Character.isLetter(). * * In addition to being equivalent to a Java function, this also serves * as a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a letter character * * @see u_isdigit * @see u_isalnum * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isalpha(UChar32 c); /** * Determines whether the specified code point is an alphanumeric character * (letter or digit) according to Java. * True for characters with general categories * "L" (letters) and "Nd" (decimal digit numbers). * * Same as java.lang.Character.isLetterOrDigit(). * * In addition to being equivalent to a Java function, this also serves * as a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is an alphanumeric character according to Character.isLetterOrDigit() * * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isalnum(UChar32 c); /** * Determines whether the specified code point is a hexadecimal digit. * This is equivalent to u_digit(c, 16)>=0. * True for characters with general category "Nd" (decimal digit numbers) * as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. * (That is, for letters with code points * 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) 
* * In order to narrow the definition of hexadecimal digits to only ASCII * characters, use (c<=0x7f && u_isxdigit(c)). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a hexadecimal digit * * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_isxdigit(UChar32 c); /** * Determines whether the specified code point is a punctuation character. * True for characters with general categories "P" (punctuation). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a punctuation character * * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_ispunct(UChar32 c); /** * Determines whether the specified code point is a "graphic" character * (printable, excluding spaces). * true for all characters except those with general categories * "Cc" (control codes), "Cf" (format controls), "Cs" (surrogates), * "Cn" (unassigned), and "Z" (separators). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a "graphic" character * * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_isgraph(UChar32 c); /** * Determines whether the specified code point is a "blank" or "horizontal space", * a character that visibly separates words on a line. 
* The following are equivalent definitions: * * true for Unicode White_Space characters except for "vertical space controls" * where "vertical space controls" are the following characters: * U+000A (LF) U+000B (VT) U+000C (FF) U+000D (CR) U+0085 (NEL) U+2028 (LS) U+2029 (PS) * * same as * * true for U+0009 (TAB) and characters with general category "Zs" (space separators). * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a "blank" * * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_isblank(UChar32 c); /** * Determines whether the specified code point is "defined", * which usually means that it is assigned a character. * True for general categories other than "Cn" (other, not assigned), * i.e., true for all code points mentioned in UnicodeData.txt. * * Note that non-character code points (e.g., U+FDD0) are not "defined" * (they are Cn), but surrogate code points are "defined" (Cs). * * Same as java.lang.Character.isDefined(). * * @param c the code point to be tested * @return true if the code point is assigned a character * * @see u_isdigit * @see u_isalpha * @see u_isalnum * @see u_isupper * @see u_islower * @see u_istitle * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isdefined(UChar32 c); /** * Determines if the specified character is a space character or not. * True for characters with general categories "Z" (separators, including * no-break spaces) and for whitespace ISO controls. * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the character to be tested * @return true if the character is a space character; false otherwise.
* * @see u_isJavaSpaceChar * @see u_isWhitespace * @see u_isUWhiteSpace * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isspace(UChar32 c); /** * Determine if the specified code point is a space character according to Java. * True for characters with general categories "Z" (separators), * which does not include control codes (e.g., TAB or Line Feed). * * Same as java.lang.Character.isSpaceChar(). * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c the code point to be tested * @return true if the code point is a space character according to Character.isSpaceChar() * * @see u_isspace * @see u_isWhitespace * @see u_isUWhiteSpace * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_isJavaSpaceChar(UChar32 c); /** * Determines if the specified code point is a whitespace character according to Java/ICU. * A character is considered to be a Java whitespace character if and only * if it satisfies one of the following criteria: * * - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not * also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space or U+202F Narrow NBSP). * - It is U+0009 HORIZONTAL TABULATION. * - It is U+000A LINE FEED. * - It is U+000B VERTICAL TABULATION. * - It is U+000C FORM FEED. * - It is U+000D CARRIAGE RETURN. * - It is U+001C FILE SEPARATOR. * - It is U+001D GROUP SEPARATOR. * - It is U+001E RECORD SEPARATOR. * - It is U+001F UNIT SEPARATOR. * * This API tries to sync with the semantics of Java's * java.lang.Character.isWhitespace(), but it may not return * the exact same results because of the Unicode version * difference. * * Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs) * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false.
* See http://www.unicode.org/versions/Unicode4.0.1/ * * Note: There are several ICU whitespace functions; please see the uchar.h * file documentation for a detailed comparison. * * @param c the code point to be tested * @return true if the code point is a whitespace character according to Java/ICU * * @see u_isspace * @see u_isJavaSpaceChar * @see u_isUWhiteSpace * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isWhitespace(UChar32 c); /** * Determines whether the specified code point is a control character * (as defined by this function). * A control character is one of the following: * - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) * - U_CONTROL_CHAR (Cc) * - U_FORMAT_CHAR (Cf) * - U_LINE_SEPARATOR (Zl) * - U_PARAGRAPH_SEPARATOR (Zp) * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. * * @param c the code point to be tested * @return true if the code point is a control character * * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT * @see u_isprint * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_iscntrl(UChar32 c); /** * Determines whether the specified code point is an ISO control code. * True for U+0000..U+001f and U+007f..U+009f (general category "Cc"). * * Same as java.lang.Character.isISOControl(). * * @param c the code point to be tested * @return true if the code point is an ISO control code * * @see u_iscntrl * @stable ICU 2.6 */ U_CAPI UBool U_EXPORT2 u_isISOControl(UChar32 c); /** * Determines whether the specified code point is a printable character. * True for general categories <em>other</em> than "C" (controls). * * This is a C/POSIX migration function. * See the comments about C/POSIX character classification functions in the * documentation at the top of this header file. 
* * @param c the code point to be tested * @return true if the code point is a printable character * * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT * @see u_iscntrl * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isprint(UChar32 c); /** * Non-standard: Determines whether the specified code point is a base character. * True for general categories "L" (letters), "N" (numbers), * "Mc" (spacing combining marks), and "Me" (enclosing marks). * * Note that this is different from the Unicode Standard definition in * chapter 3.6, conformance clause D51 “Base character”, * which defines base characters as the code points with general categories * Letter (L), Number (N), Punctuation (P), Symbol (S), or Space Separator (Zs). * * @param c the code point to be tested * @return true if the code point is a base character according to this function * * @see u_isalpha * @see u_isdigit * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isbase(UChar32 c); /** * Returns the bidirectional category value for the code point, * which is used in the Unicode bidirectional algorithm * (UAX #9 http://www.unicode.org/reports/tr9/). * Note that some <em>unassigned</em> code points have bidi values * of R or AL because they are in blocks that are reserved * for Right-To-Left scripts. * * Same as java.lang.Character.getDirectionality() * * @param c the code point to be tested * @return the bidirectional category (UCharDirection) value * * @see UCharDirection * @stable ICU 2.0 */ U_CAPI UCharDirection U_EXPORT2 u_charDirection(UChar32 c); /** * Determines whether the code point has the Bidi_Mirrored property. * This property is set for characters that are commonly used in * Right-To-Left contexts and need to be displayed with a "mirrored" * glyph. * * Same as java.lang.Character.isMirrored(). 
* Same as UCHAR_BIDI_MIRRORED * * @param c the code point to be tested * @return true if the character has the Bidi_Mirrored property * * @see UCHAR_BIDI_MIRRORED * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isMirrored(UChar32 c); /** * Maps the specified character to a "mirror-image" character. * For characters with the Bidi_Mirrored property, implementations * sometimes need a "poor man's" mapping to another Unicode * character (code point) such that the default glyph may serve * as the mirror-image of the default glyph of the specified * character. This is useful for text conversion to and from * codepages with visual order, and for displays without glyph * selection capabilities. * * @param c the code point to be mapped * @return another Unicode code point that may serve as a mirror-image * substitute, or c itself if there is no such mapping or c * does not have the Bidi_Mirrored property * * @see UCHAR_BIDI_MIRRORED * @see u_isMirrored * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_charMirror(UChar32 c); /** * Maps the specified character to its paired bracket character. * For Bidi_Paired_Bracket_Type!=None, this is the same as u_charMirror(). * Otherwise c itself is returned. * See http://www.unicode.org/reports/tr9/ * * @param c the code point to be mapped * @return the paired bracket code point, * or c itself if there is no such mapping * (Bidi_Paired_Bracket_Type=None) * * @see UCHAR_BIDI_PAIRED_BRACKET * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE * @see u_charMirror * @stable ICU 52 */ U_CAPI UChar32 U_EXPORT2 u_getBidiPairedBracket(UChar32 c); /** * Returns the general category value for the code point. * * Same as java.lang.Character.getType(). * * @param c the code point to be tested * @return the general category (UCharCategory) value * * @see UCharCategory * @stable ICU 2.0 */ U_CAPI int8_t U_EXPORT2 u_charType(UChar32 c); /** * Get a single-bit bit set for the general category of a character. 
* This bit set can be compared bitwise with U_GC_SM_MASK, U_GC_L_MASK, etc. * Same as U_MASK(u_charType(c)). * * @param c the code point to be tested * @return a single-bit mask corresponding to the general category (UCharCategory) value * * @see u_charType * @see UCharCategory * @see U_GC_CN_MASK * @stable ICU 2.1 */ #define U_GET_GC_MASK(c) … /** * Callback from u_enumCharTypes(), is called for each contiguous range * of code points c (where start<=c<limit) * with the same Unicode general category ("character type"). * * The callback function can stop the enumeration by returning false. * * @param context an opaque pointer, as passed into u_enumCharTypes() * @param start the first code point in a contiguous range with value * @param limit one past the last code point in a contiguous range with value * @param type the general category for all code points in [start..limit[ * @return false to stop the enumeration * * @stable ICU 2.1 * @see UCharCategory * @see u_enumCharTypes */ UCharEnumTypeRange; /** * Enumerate efficiently all code points with their Unicode general categories. * * This is useful for building data structures (e.g., UnicodeSet's), * for enumerating all assigned code points (type!=U_UNASSIGNED), etc. * * For each contiguous range of code points with a given general category ("character type"), * the UCharEnumTypeRange function is called. * Adjacent ranges have different types. * The Unicode Standard guarantees that the numeric value of the type is 0..31. * * @param enumRange a pointer to a function that is called for each contiguous range * of code points with the same general category * @param context an opaque pointer that is passed on to the callback function * * @stable ICU 2.1 * @see UCharCategory * @see UCharEnumTypeRange */ U_CAPI void U_EXPORT2 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context); #if !UCONFIG_NO_NORMALIZATION /** * Returns the combining class of the code point as specified in UnicodeData.txt.
* * @param c the code point of the character * @return the combining class of the character * @stable ICU 2.0 */ U_CAPI uint8_t U_EXPORT2 u_getCombiningClass(UChar32 c); #endif /** * Returns the decimal digit value of a decimal digit character. * Such characters have the general category "Nd" (decimal digit numbers) * and a Numeric_Type of Decimal. * * Unlike ICU releases before 2.6, no digit values are returned for any * Han characters because Han number characters are often used with a special * Chinese-style number format (with characters for powers of 10 in between) * instead of in decimal-positional notation. * Unicode 4 explicitly assigns Han number characters the Numeric_Type * Numeric instead of Decimal. * See Jitterbug 1483 for more details. * * Use u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE) and u_getNumericValue() * for complete numeric Unicode properties. * * @param c the code point for which to get the decimal digit value * @return the decimal digit value of c, * or -1 if c is not a decimal digit character * * @see u_getNumericValue * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_charDigitValue(UChar32 c); /** * Returns the Unicode allocation block that contains the character. * * @param c the code point to be tested * @return the block value (UBlockCode) for c * * @see UBlockCode * @stable ICU 2.0 */ U_CAPI UBlockCode U_EXPORT2 ublock_getCode(UChar32 c); /** * Retrieve the name of a Unicode character. * Depending on <code>nameChoice</code>, the character name written * into the buffer is the "modern" name or the name that was defined * in Unicode version 1.0. * The name contains only "invariant" characters * like A-Z, 0-9, space, and '-'. * Unicode 1.0 names are only retrieved if they are different from the modern * names and if the data file contains the data for them. gennames may or may * not be called with a command line option to include 1.0 names in unames.dat. * * @param code The character (code point) for which to get the name. 
* It must be <code>0<=code<=0x10ffff</code>. * @param nameChoice Selector for which name to get. * @param buffer Destination address for copying the name. * The name will always be zero-terminated. * If there is no name, then the buffer will be set to the empty string. * @param bufferLength <code>==sizeof(buffer)</code> * @param pErrorCode Pointer to a UErrorCode variable; * check for <code>U_SUCCESS()</code> after <code>u_charName()</code> * returns. * @return The length of the name, or 0 if there is no name for this character. * If the bufferLength is less than or equal to the length, then the buffer * contains the truncated name and the returned length indicates the full * length of the name. * The length does not include the zero-termination. * * @see UCharNameChoice * @see u_charFromName * @see u_enumCharNames * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode); #ifndef U_HIDE_DEPRECATED_API /** * Returns an empty string. * Used to return the ISO 10646 comment for a character. * The Unicode ISO_Comment property is deprecated and has no values. * * @param c The character (code point) for which to get the ISO comment. * It must be <code>0<=c<=0x10ffff</code>. * @param dest Destination address for copying the comment. * The comment will be zero-terminated if possible. * If there is no comment, then the buffer will be set to the empty string. * @param destCapacity <code>==sizeof(dest)</code> * @param pErrorCode Pointer to a UErrorCode variable; * check for <code>U_SUCCESS()</code> after <code>u_getISOComment()</code> * returns. * @return 0 * * @deprecated ICU 49 */ U_DEPRECATED int32_t U_EXPORT2 u_getISOComment(UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode); #endif /* U_HIDE_DEPRECATED_API */ /** * Find a Unicode character by its name and return its code point value. * The name is matched exactly and completely. 
* If the name does not correspond to a code point, <i>pErrorCode</i> * is set to <code>U_INVALID_CHAR_FOUND</code>. * A Unicode 1.0 name is matched only if it differs from the modern name. * Unicode names are all uppercase. Extended names are lowercase followed * by an uppercase hexadecimal number, and within angle brackets. * * @param nameChoice Selector for which name to match. * @param name The name to match. * @param pErrorCode Pointer to a UErrorCode variable * @return The Unicode value of the code point with the given name, * or an undefined value if there is no such code point. * * @see UCharNameChoice * @see u_charName * @see u_enumCharNames * @stable ICU 1.7 */ U_CAPI UChar32 U_EXPORT2 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode); /** * Type of a callback function for u_enumCharNames() that gets called * for each Unicode character with the code point value and * the character name. * If such a function returns false, then the enumeration is stopped. * * @param context The context pointer that was passed to u_enumCharNames(). * @param code The Unicode code point for the character with this name. * @param nameChoice Selector for which kind of names is enumerated. * @param name The character's name, zero-terminated. * @param length The length of the name. * @return true if the enumeration should continue, false to stop it. * * @see UCharNameChoice * @see u_enumCharNames * @stable ICU 1.7 */ UEnumCharNamesFn; /** * Enumerate all assigned Unicode characters between the start and limit * code points (start inclusive, limit exclusive) and call a function * for each, passing the code point value and the character name. * For Unicode 1.0 names, only those are enumerated that differ from the * modern names. * * @param start The first code point in the enumeration range. * @param limit One more than the last code point in the enumeration range * (the first one after the range). 
* @param fn The function that is to be called for each character name. * @param context An arbitrary pointer that is passed to the function. * @param nameChoice Selector for which kind of names to enumerate. * @param pErrorCode Pointer to a UErrorCode variable * * @see UCharNameChoice * @see UEnumCharNamesFn * @see u_charName * @see u_charFromName * @stable ICU 1.7 */ U_CAPI void U_EXPORT2 u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode); /** * Return the Unicode name for a given property, as given in the * Unicode database file PropertyAliases.txt. * * In addition, this function maps the property * UCHAR_GENERAL_CATEGORY_MASK to the synthetic names "gcm" / * "General_Category_Mask". These names are not in * PropertyAliases.txt. * * @param property UProperty selector other than UCHAR_INVALID_CODE. * If out of range, NULL is returned. * * @param nameChoice selector for which name to get. If out of range, * NULL is returned. All properties have a long name. Most * have a short name, but some do not. Unicode allows for * additional names; if present these will be returned by * U_LONG_PROPERTY_NAME + i, where i=1, 2,... * * @return a pointer to the name, or NULL if either the * property or the nameChoice is out of range. If a given * nameChoice returns NULL, then all larger values of * nameChoice will return NULL, with one exception: if NULL is * returned for U_SHORT_PROPERTY_NAME, then * U_LONG_PROPERTY_NAME (and higher) may still return a * non-NULL value. The returned pointer is valid until * u_cleanup() is called. * * @see UProperty * @see UPropertyNameChoice * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 u_getPropertyName(UProperty property, UPropertyNameChoice nameChoice); /** * Return the UProperty enum for a given property name, as specified * in the Unicode database file PropertyAliases.txt. Short, long, and * any other variants are recognized. 
* * In addition, this function maps the synthetic names "gcm" / * "General_Category_Mask" to the property * UCHAR_GENERAL_CATEGORY_MASK. These names are not in * PropertyAliases.txt. * * @param alias the property name to be matched. The name is compared * using "loose matching" as described in PropertyAliases.txt. * * @return a UProperty enum, or UCHAR_INVALID_CODE if the given name * does not match any property. * * @see UProperty * @stable ICU 2.4 */ U_CAPI UProperty U_EXPORT2 u_getPropertyEnum(const char* alias); /** * Return the Unicode name for a given property value, as given in the * Unicode database file PropertyValueAliases.txt. * * Note: Some of the names in PropertyValueAliases.txt can only be * retrieved using UCHAR_GENERAL_CATEGORY_MASK, not * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". * * @param property UProperty selector constant. * Must be UCHAR_BINARY_START<=property<UCHAR_BINARY_LIMIT * or UCHAR_INT_START<=property<UCHAR_INT_LIMIT * or UCHAR_MASK_START<=property<UCHAR_MASK_LIMIT. * If out of range, NULL is returned. * * @param value selector for a value for the given property. If out * of range, NULL is returned. In general, valid values range * from 0 up to some maximum. There are a few exceptions: * (1.) UCHAR_BLOCK values begin at the non-zero value * UBLOCK_BASIC_LATIN. (2.) UCHAR_CANONICAL_COMBINING_CLASS * values are not contiguous and range from 0..240. (3.) * UCHAR_GENERAL_CATEGORY_MASK values are not values of * UCharCategory, but rather mask values produced by * U_GET_GC_MASK(). This allows grouped categories such as * [:L:] to be represented. Mask values range * non-contiguously from 1..U_GC_P_MASK. * * @param nameChoice selector for which name to get. If out of range, * NULL is returned. All values have a long name. Most have * a short name, but some do not.
Unicode allows for * additional names; if present these will be returned by * U_LONG_PROPERTY_NAME + i, where i=1, 2,... * @return a pointer to the name, or NULL if either the * property or the nameChoice is out of range. If a given * nameChoice returns NULL, then all larger values of * nameChoice will return NULL, with one exception: if NULL is * returned for U_SHORT_PROPERTY_NAME, then * U_LONG_PROPERTY_NAME (and higher) may still return a * non-NULL value. The returned pointer is valid until * u_cleanup() is called. * * @see UProperty * @see UPropertyNameChoice * @stable ICU 2.4 */ U_CAPI const char* U_EXPORT2 u_getPropertyValueName(UProperty property, int32_t value, UPropertyNameChoice nameChoice); /** * Return the property value integer for a given value name, as * specified in the Unicode database file PropertyValueAliases.txt. * Short, long, and any other variants are recognized. * * Note: Some of the names in PropertyValueAliases.txt will only be * recognized with UCHAR_GENERAL_CATEGORY_MASK, not * UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" / * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P" * / "Punctuation", "S" / "Symbol", and "Z" / "Separator". * * @param property UProperty selector constant. * Must be UCHAR_BINARY_START<=property<UCHAR_BINARY_LIMIT * or UCHAR_INT_START<=property<UCHAR_INT_LIMIT * or UCHAR_MASK_START<=property<UCHAR_MASK_LIMIT. * If out of range, UCHAR_INVALID_CODE is returned. * * @param alias the value name to be matched. The name is compared * using "loose matching" as described in * PropertyValueAliases.txt. * * @return a value integer or UCHAR_INVALID_CODE if the given name * does not match any value of the given property, or if the * property is invalid. Note: UCHAR_GENERAL_CATEGORY_MASK values * are not values of UCharCategory, but rather mask values * produced by U_GET_GC_MASK(). This allows grouped * categories such as [:L:] to be represented.
* * @see UProperty * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 u_getPropertyValueEnum(UProperty property, const char* alias); /** * Determines if the specified character is permissible as the first character in an identifier * according to UAX #31 Unicode Identifier and Pattern Syntax. * * Same as Unicode ID_Start (UCHAR_ID_START). * * @param c the code point to be tested * @return true if the code point may start an identifier * * @see UCHAR_ID_START * @see u_isalpha * @see u_isIDPart * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isIDStart(UChar32 c); /** * Determines if the specified character is permissible as a non-initial character of an identifier * according to UAX #31 Unicode Identifier and Pattern Syntax. * * Same as Unicode ID_Continue (UCHAR_ID_CONTINUE). * * @param c the code point to be tested * @return true if the code point may occur as a non-initial character of an identifier * * @see UCHAR_ID_CONTINUE * @see u_isIDStart * @see u_isIDIgnorable * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isIDPart(UChar32 c); #ifndef U_HIDE_DRAFT_API /** * Does the set of Identifier_Type values of code point c contain the given type? * * Used for UTS #39 General Security Profile for Identifiers * (https://www.unicode.org/reports/tr39/#General_Security_Profile). * * Each code point maps to a <i>set</i> of UIdentifierType values. * * @param c code point * @param type Identifier_Type to check * @return true if type is in Identifier_Type(c) * @draft ICU 75 */ U_CAPI bool U_EXPORT2 u_hasIDType(UChar32 c, UIdentifierType type); /** * Writes code point c's Identifier_Type as a list of UIdentifierType values * to the output types array and returns the number of types. * * Used for UTS #39 General Security Profile for Identifiers * (https://www.unicode.org/reports/tr39/#General_Security_Profile). * * Each code point maps to a <i>set</i> of UIdentifierType values. * There is always at least one type. * The order of output values is undefined.
* Each type is output at most once; * there cannot be more output values than UIdentifierType constants. * In addition, only some of the types can be combined with others, * and usually only a small number of types occur together. * Future versions might add additional types. * See UTS #39 and its data files for details. * * If there are more than capacity types to be written, then * U_BUFFER_OVERFLOW_ERROR is set and the number of types is returned. * (Usual ICU buffer handling behavior.) * * @param c code point * @param types output array * @param capacity capacity of the array * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns * immediately. Check for U_FAILURE() on output or use with * function chaining. (See User Guide for details.) * @return number of values in c's Identifier_Type, * written to types unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity * @draft ICU 75 */ U_CAPI int32_t U_EXPORT2 u_getIDTypes(UChar32 c, UIdentifierType *types, int32_t capacity, UErrorCode *pErrorCode); #endif // U_HIDE_DRAFT_API /** * Determines if the specified character should be regarded * as an ignorable character in an identifier, * according to Java. * True for characters with general category "Cf" (format controls) as well as * non-whitespace ISO controls * (U+0000..U+0008, U+000E..U+001B, U+007F..U+009F). * * Same as java.lang.Character.isIdentifierIgnorable(). * * Note that Unicode just recommends to ignore Cf (format controls). * * @param c the code point to be tested * @return true if the code point is ignorable in identifiers according to Java * * @see UCHAR_DEFAULT_IGNORABLE_CODE_POINT * @see u_isIDStart * @see u_isIDPart * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isIDIgnorable(UChar32 c); /** * Determines if the specified character is permissible as the * first character in a Java identifier. 
* In addition to u_isIDStart(c), true for characters with * general categories "Sc" (currency symbols) and "Pc" (connecting punctuation). * * Same as java.lang.Character.isJavaIdentifierStart(). * * @param c the code point to be tested * @return true if the code point may start a Java identifier * * @see u_isJavaIDPart * @see u_isalpha * @see u_isIDStart * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isJavaIDStart(UChar32 c); /** * Determines if the specified character is permissible * in a Java identifier. * In addition to u_isIDPart(c), true for characters with * general category "Sc" (currency symbols). * * Same as java.lang.Character.isJavaIdentifierPart(). * * @param c the code point to be tested * @return true if the code point may occur in a Java identifier * * @see u_isIDIgnorable * @see u_isJavaIDStart * @see u_isalpha * @see u_isdigit * @see u_isIDPart * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isJavaIDPart(UChar32 c); /** * The given character is mapped to its lowercase equivalent according to * UnicodeData.txt; if the character has no lowercase equivalent, the character * itself is returned. * * Same as java.lang.Character.toLowerCase(). * * This function only returns the simple, single-code point case mapping. * Full case mappings should be used whenever possible because they produce * better results by working on whole strings. * They take into account the string context and the language and can map * to a result string with a different length as appropriate. * Full case mappings are applied by the string case mapping functions, * see ustring.h and the UnicodeString class. * See also the User Guide chapter on C/POSIX migration: * https://unicode-org.github.io/icu/userguide/icu/posix#case-mappings * * @param c the code point to be mapped * @return the Simple_Lowercase_Mapping of the code point, if any; * otherwise the code point itself. 
* @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_tolower(UChar32 c); /** * The given character is mapped to its uppercase equivalent according to UnicodeData.txt; * if the character has no uppercase equivalent, the character itself is * returned. * * Same as java.lang.Character.toUpperCase(). * * This function only returns the simple, single-code point case mapping. * Full case mappings should be used whenever possible because they produce * better results by working on whole strings. * They take into account the string context and the language and can map * to a result string with a different length as appropriate. * Full case mappings are applied by the string case mapping functions, * see ustring.h and the UnicodeString class. * See also the User Guide chapter on C/POSIX migration: * https://unicode-org.github.io/icu/userguide/icu/posix#case-mappings * * @param c the code point to be mapped * @return the Simple_Uppercase_Mapping of the code point, if any; * otherwise the code point itself. * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_toupper(UChar32 c); /** * The given character is mapped to its titlecase equivalent * according to UnicodeData.txt; * if none is defined, the character itself is returned. * * Same as java.lang.Character.toTitleCase(). * * This function only returns the simple, single-code point case mapping. * Full case mappings should be used whenever possible because they produce * better results by working on whole strings. * They take into account the string context and the language and can map * to a result string with a different length as appropriate. * Full case mappings are applied by the string case mapping functions, * see ustring.h and the UnicodeString class. * See also the User Guide chapter on C/POSIX migration: * https://unicode-org.github.io/icu/userguide/icu/posix#case-mappings * * @param c the code point to be mapped * @return the Simple_Titlecase_Mapping of the code point, if any; * otherwise the code point itself. 
* @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_totitle(UChar32 c); /** * The given character is mapped to its case folding equivalent according to * UnicodeData.txt and CaseFolding.txt; * if the character has no case folding equivalent, the character * itself is returned. * * This function only returns the simple, single-code point case mapping. * Full case mappings should be used whenever possible because they produce * better results by working on whole strings. * They take into account the string context and the language and can map * to a result string with a different length as appropriate. * Full case mappings are applied by the string case mapping functions, * see ustring.h and the UnicodeString class. * See also the User Guide chapter on C/POSIX migration: * https://unicode-org.github.io/icu/userguide/icu/posix#case-mappings * * @param c the code point to be mapped * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I * @return the Simple_Case_Folding of the code point, if any; * otherwise the code point itself. * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_foldCase(UChar32 c, uint32_t options); /** * Returns the decimal digit value of the code point in the * specified radix. * * If the radix is not in the range <code>2<=radix<=36</code> or if the * value of <code>ch</code> is not a valid digit in the specified * radix, <code>-1</code> is returned. A character is a valid digit * if at least one of the following is true: * <ul> * <li>The character has a decimal digit value. * Such characters have the general category "Nd" (decimal digit numbers) * and a Numeric_Type of Decimal. * In this case the value is the character's decimal digit value.</li> * <li>The character is one of the uppercase Latin letters * <code>'A'</code> through <code>'Z'</code>. * In this case the value is <code>ch-'A'+10</code>.</li> * <li>The character is one of the lowercase Latin letters * <code>'a'</code> through <code>'z'</code>.
* In this case the value is <code>ch-'a'+10</code>.</li> * <li>Latin letters from both the ASCII range (0061..007A, 0041..005A) * as well as from the Fullwidth ASCII range (FF41..FF5A, FF21..FF3A) * are recognized.</li> * </ul> * * Same as java.lang.Character.digit(). * * @param ch the code point to be tested. * @param radix the radix. * @return the numeric value represented by the character in the * specified radix, * or -1 if there is no value or if the value exceeds the radix. * * @see UCHAR_NUMERIC_TYPE * @see u_forDigit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_CAPI int32_t U_EXPORT2 u_digit(UChar32 ch, int8_t radix); /** * Determines the character representation for a specific digit in * the specified radix. If the value of <code>radix</code> is not a * valid radix, or the value of <code>digit</code> is not a valid * digit in the specified radix, the null character * (<code>U+0000</code>) is returned. * <p> * The <code>radix</code> argument is valid if it is greater than or * equal to 2 and less than or equal to 36. * The <code>digit</code> argument is valid if * <code>0 <= digit < radix</code>. * <p> * If the digit is less than 10, then * <code>'0' + digit</code> is returned. Otherwise, the value * <code>'a' + digit - 10</code> is returned. * * Same as java.lang.Character.forDigit(). * * @param digit the number to convert to a character. * @param radix the radix. * @return the <code>char</code> representation of the specified digit * in the specified radix. * * @see u_digit * @see u_charDigitValue * @see u_isdigit * @stable ICU 2.0 */ U_CAPI UChar32 U_EXPORT2 u_forDigit(int32_t digit, int8_t radix); /** * Get the "age" of the code point. * The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) * or assigned a character. * This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters. 
* The data is from the UCD file DerivedAge.txt. * * @param c The code point. * @param versionArray The Unicode version number array, to be filled in. * * @stable ICU 2.1 */ U_CAPI void U_EXPORT2 u_charAge(UChar32 c, UVersionInfo versionArray); /** * Gets the Unicode version information. * The version array is filled in with the version information * for the Unicode standard that is currently used by ICU. * For example, Unicode version 3.1.1 is represented as an array with * the values { 3, 1, 1, 0 }. * * @param versionArray an output array that will be filled in with * the Unicode version number * @stable ICU 2.0 */ U_CAPI void U_EXPORT2 u_getUnicodeVersion(UVersionInfo versionArray); #if !UCONFIG_NO_NORMALIZATION /** * Get the FC_NFKC_Closure property string for a character. * See Unicode Standard Annex #15 for details, search for "FC_NFKC_Closure" * or for "FNC": http://www.unicode.org/reports/tr15/ * * @param c The character (code point) for which to get the FC_NFKC_Closure string. * It must be <code>0<=c<=0x10ffff</code>. * @param dest Destination address for copying the string. * The string will be zero-terminated if possible. * If there is no FC_NFKC_Closure string, * then the buffer will be set to the empty string. * @param destCapacity <code>==sizeof(dest)</code> * @param pErrorCode Pointer to a UErrorCode variable. * @return The length of the string, or 0 if there is no FC_NFKC_Closure string for this character. * If the destCapacity is less than or equal to the length, then the buffer * contains the truncated string and the returned length indicates the full * length of the string. * The length does not include the zero-termination. * * @stable ICU 2.2 */ U_CAPI int32_t U_EXPORT2 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode); #endif U_CDECL_END #endif /* UCHAR_H */ /*eof*/