chromium/third_party/icu/source/common/unicode/uscript.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1997-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
 * File USCRIPT.H
 *
 * Modification History:
 *
 *   Date        Name        Description
 *   07/06/2001    Ram         Creation.
 ******************************************************************************
 */

#ifndef USCRIPT_H
#define USCRIPT_H
#include "unicode/utypes.h"

/**
 * \file
 * \brief C API: Unicode Script Information
 */

/**
 * Constants for ISO 15924 script codes.
 *
 * The current set of script code constants supports at least all scripts
 * that are encoded in the version of Unicode which ICU currently supports.
 * The names of the constants are usually derived from the
 * Unicode script property value aliases.
 * See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/)
 * and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt .
 *
 * In addition, constants for many ISO 15924 script codes
 * are included, for use with language tags, CLDR data, and similar.
 * Some of those codes are not used in the Unicode Character Database (UCD).
 * For example, there are no characters that have a UCD script property value of
 * Hans or Hant. All Han ideographs have the Hani script property value in Unicode.
 *
 * Private-use codes Qaaa..Qabx are not included, except as used in the UCD or in CLDR.
 *
 * Starting with ICU 55, script codes are only added when their scripts
 * have been or will certainly be encoded in Unicode,
 * and have been assigned Unicode script property value aliases,
 * to ensure that their script names are stable and match the names of the constants.
 * Script codes like Latf and Aran that are not subject to separate encoding
 * may be added at any time.
 *
 * @stable ICU 2.2
 */
UScriptCode;

/**
 * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
 * Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym".
 * Fills in USCRIPT_LATIN given "en" OR "en_US".
 * If the required capacity is greater than the capacity of the destination buffer,
 * then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned.
 *
 * <p>Note: To search by short or long script alias only, use
 * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead.  That does
 * a fast lookup with no access of the locale data.
 *
 * @param nameOrAbbrOrLocale name of the script, as given in
 * PropertyValueAliases.txt, or ISO 15924 code or locale
 * @param fillIn the UScriptCode buffer to fill in with the script code(s)
 * @param capacity the capacity (size) of the UScriptCode buffer passed in.
 * @param err the error status code.
 * @return The number of script codes filled in the buffer passed in,
 * or the required capacity if the error code is set to U_BUFFER_OVERFLOW_ERROR
 * @stable ICU 2.4
 */
U_CAPI int32_t  U_EXPORT2
uscript_getCode(const char* nameOrAbbrOrLocale,UScriptCode* fillIn,int32_t capacity,UErrorCode *err);

/**
 * Returns the long Unicode script name, if there is one.
 * Otherwise returns the 4-letter ISO 15924 script code.
 * Returns "Malayalam" given USCRIPT_MALAYALAM.
 *
 * @param scriptCode UScriptCode enum
 * @return long script name as given in PropertyValueAliases.txt, or the 4-letter code,
 * or NULL if scriptCode is invalid
 * @stable ICU 2.4
 */
U_CAPI const char*  U_EXPORT2
uscript_getName(UScriptCode scriptCode);

/**
 * Returns the 4-letter ISO 15924 script code,
 * which is the same as the short Unicode script name if Unicode has a name for the script.
 * Returns "Mlym" given USCRIPT_MALAYALAM.
 *
 * @param scriptCode UScriptCode enum
 * @return short script name (4-letter code), or NULL if scriptCode is invalid
 * @stable ICU 2.4
 */
U_CAPI const char*  U_EXPORT2
uscript_getShortName(UScriptCode scriptCode);

/**
 * Gets the script code associated with the given code point.
 * Returns USCRIPT_MALAYALAM given 0x0D02.
 *
 * @param codepoint UChar32 code point
 * @param err the error status code.
 * @return The UScriptCode, or 0 if codepoint is invalid
 * @stable ICU 2.4
 */
U_CAPI UScriptCode  U_EXPORT2
uscript_getScript(UChar32 codepoint, UErrorCode *err);

/**
 * Do the Script_Extensions of code point c contain script sc?
 * If c does not have explicit Script_Extensions, then this tests whether
 * c has the Script property value sc.
 *
 * Some characters are commonly used in multiple scripts.
 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
 *
 * @param c code point
 * @param sc script code
 * @return true if sc is in Script_Extensions(c), or, when c has no explicit
 *         Script_Extensions, if sc is the Script property value of c
 * @stable ICU 49
 */
U_CAPI UBool U_EXPORT2
uscript_hasScript(UChar32 c, UScriptCode sc);

/**
 * Writes code point c's Script_Extensions as a list of UScriptCode values
 * to the output scripts array and returns the number of script codes.
 * - If c does have Script_Extensions, then the Script property value
 *   (normally Common or Inherited) is not included.
 * - If c does not have Script_Extensions, then the one Script code is written to the output array.
 * - If c is not a valid code point, then the one USCRIPT_UNKNOWN code is written.
 *
 * In other words, if the return value is 1,
 * then the output array contains exactly c's single Script code.
 * If the return value is n>=2, then the output array contains c's n Script_Extensions script codes.
 *
 * Some characters are commonly used in multiple scripts.
 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
 *
 * If the number of script codes to be written exceeds capacity, then
 * U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned.
 * (Usual ICU buffer handling behavior.)
 *
 * @param c code point
 * @param scripts output script code array
 * @param capacity capacity of the scripts array
 * @param errorCode Standard ICU error code. Its input value must
 *                  pass the U_SUCCESS() test, or else the function returns
 *                  immediately. Check for U_FAILURE() on output or use with
 *                  function chaining. (See User Guide for details.)
 * @return number of script codes in c's Script_Extensions, or 1 for the single Script value,
 *         written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity
 * @stable ICU 49
 */
U_CAPI int32_t U_EXPORT2
uscript_getScriptExtensions(UChar32 c,
                            UScriptCode *scripts, int32_t capacity,
                            UErrorCode *errorCode);

/**
 * Script usage constants.
 * See UAX #31 Unicode Identifier and Pattern Syntax.
 * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers
 *
 * @stable ICU 51
 */
UScriptUsage;

/**
 * Writes the script sample character string.
 * This string normally consists of one code point but might be longer.
 * The string is empty if the script is not encoded.
 *
 * @param script script code
 * @param dest output string array
 * @param capacity number of UChars in the dest array
 * @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input
 * @return the string length, even if U_BUFFER_OVERFLOW_ERROR is set
 * @stable ICU 51
 */
U_CAPI int32_t U_EXPORT2
uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode);

// C++-only API: the declarations below are compiled only when U_SHOW_CPLUSPLUS_API is nonzero.
#if U_SHOW_CPLUSPLUS_API

// Forward declaration of UnicodeString, which is used as the return type
// of the prototype below.
U_NAMESPACE_BEGIN
class UnicodeString;
U_NAMESPACE_END

/**
 * Returns the script sample character string.
 * This string normally consists of one code point but might be longer.
 * The string is empty if the script is not encoded.
 *
 * @param script script code
 * @return the sample character string
 * @stable ICU 51
 */
U_COMMON_API icu::UnicodeString U_EXPORT2
uscript_getSampleUnicodeString(UScriptCode script);

#endif

/**
 * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
 * Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode.
 *
 * @param script script code
 * @return the script usage, as a UScriptUsage value
 * @see UScriptUsage
 * @stable ICU 51
 */
U_CAPI UScriptUsage U_EXPORT2
uscript_getUsage(UScriptCode script);

/**
 * Returns true if the script is written right-to-left.
 * For example, Arab and Hebr are right-to-left scripts.
 *
 * @param script script code
 * @return true if the script is right-to-left
 * @stable ICU 51
 */
U_CAPI UBool U_EXPORT2
uscript_isRightToLeft(UScriptCode script);

/**
 * Returns true if the script allows line breaks between letters (excluding hyphenation).
 * Such a script typically requires dictionary-based line breaking.
 * For example, Hani and Thai allow line breaks between letters.
 *
 * @param script script code
 * @return true if the script allows line breaks between letters
 * @stable ICU 51
 */
U_CAPI UBool U_EXPORT2
uscript_breaksBetweenLetters(UScriptCode script);

/**
 * Returns true if, in modern (or most recent) usage of the script,
 * case distinctions are customary.
 * For example, Latn and Cyrl are cased scripts.
 *
 * @param script script code
 * @return true if the script is cased
 * @stable ICU 51
 */
U_CAPI UBool U_EXPORT2
uscript_isCased(UScriptCode script);

#endif