uloc.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 1997-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File ULOC.CPP
*
* Modification History:
*
*   Date        Name        Description
*   04/01/97    aliu        Creation.
*   08/21/98    stephen     JDK 1.2 sync
*   12/08/98    rtg         New Locale implementation and C API
*   03/15/99    damiba      overhaul.
*   04/06/99    stephen     changed setDefault() to realloc and copy
*   06/14/99    stephen     Changed calls to ures_open for new params
*   07/21/99    stephen     Modified setDefault() to propagate to C++
*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
*                           brought canonicalization code into line with spec
*****************************************************************************/

/*
   POSIX's locale format, from putil.c: [no spaces]

     ll [ _CC ] [ . MM ] [ @ VV]

     l = lang, C = ctry, M = charmap, V = variant
*/

#include <algorithm>
#include <optional>
#include <string_view>

#include "unicode/bytestream.h"
#include "unicode/errorcode.h"
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/uloc.h"

#include "bytesinkutil.h"
#include "putilimp.h"
#include "ustr_imp.h"
#include "ulocimp.h"
#include "umutex.h"
#include "cstring.h"
#include "cmemory.h"
#include "locmap.h"
#include "uarrsort.h"
#include "uenumimp.h"
#include "uassert.h"
#include "charstr.h"

U_NAMESPACE_USE

/* ### Declarations **************************************************/

/* Locale stuff from locid.cpp */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default();

namespace {

/* ### Data tables **************************************************/

/**
 * Table of language codes, both 2- and 3-letter, with preference
 * given to 2-letter codes where possible.  Includes 3-letter codes
 * that lack a 2-letter equivalent.
 *
 * This list must be in sorted order.  This list is returned directly
 * to the user by some API.
 *
 * This list must be kept in sync with LANGUAGES_3, with corresponding
 * entries matched.
 *
 * This table should be terminated with a nullptr entry, followed by a
 * second list, and another nullptr entry.  The first list is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * Notes
 *
 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
 * include the revisions up to 2001/7/27 *CWB*
 *
 * The 3 character codes are the terminology codes like RFC 3066.  This
 * is compatible with prior ICU codes
 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
 * table but now at the end of the table because 3 character codes are
 * duplicates.  This avoids bad searches going from 3 to 2 character
 * codes.
 *
 * The range qaa-qtz is reserved for local use
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES[] = …;

constexpr const char* DEPRECATED_LANGUAGES[]= …;
constexpr const char* REPLACEMENT_LANGUAGES[]= …;

/**
 * Table of 3-letter language codes.
 *
 * This is a lookup table used to convert 3-letter language codes to
 * their 2-letter equivalent, where possible.  It must be kept in sync
 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 * same language as LANGUAGES_3[i].  The commented-out lines are
 * copied from LANGUAGES to make eyeballing this baby easier.
 *
 * Where a 3-letter language code has no 2-letter equivalent, the
 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 *
 * This table should be terminated with a nullptr entry, followed by a
 * second list, and another nullptr entry.  The two lists correspond to
 * the two lists in LANGUAGES.
 */
/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
/* ISO639 table version is 20150505 */
/* Subsequent hand addition of selected languages */
constexpr const char* LANGUAGES_3[] = …;

/**
 * Table of 2-letter country codes.
 *
 * This list must be in sorted order.  This list is returned directly
 * to the user by some API.
 *
 * This list must be kept in sync with COUNTRIES_3, with corresponding
 * entries matched.
 *
 * This table should be terminated with a nullptr entry, followed by a
 * second list, and another nullptr entry.  The first list is visible to
 * user code when this array is returned by API.  The second list
 * contains codes we support, but do not expose through user API.
 *
 * Notes:
 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 * new codes keeping the old ones for compatibility updated to include
 * 1999/12/03 revisions *CWB*
 *
 * RO(ROM) is now RO(ROU) according to
 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 */
constexpr const char* COUNTRIES[] = …;

constexpr const char* DEPRECATED_COUNTRIES[] = …;
constexpr const char* REPLACEMENT_COUNTRIES[] = …;

/**
 * Table of 3-letter country codes.
 *
 * This is a lookup table used to convert 3-letter country codes to
 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 * For all valid i, COUNTRIES[i] must refer to the same country as
 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 * to make eyeballing this baby easier.
 *
 * This table should be terminated with a nullptr entry, followed by a
 * second list, and another nullptr entry.  The two lists correspond to
 * the two lists in COUNTRIES.
 */
constexpr const char* COUNTRIES_3[] = …;

CanonicalizationMap;

/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 */
constexpr CanonicalizationMap CANONICALIZE_MAP[] = …;

/* ### BCP47 Conversion *******************************************/
/* Gets the size of the shortest subtag in the given localeID. */
int32_t getShortestSubtagLength(const char *localeID) { … }
/* Test if the locale id has BCP47 u extension and does not have '@' */
inline bool _hasBCP47Extension(const char *id) { … }

/* ### Keywords **************************************************/
inline bool UPRV_ISDIGIT(char c) { … }
inline bool UPRV_ISALPHANUM(char c) { … }
/* Punctuation/symbols allowed in legacy key values */
inline bool UPRV_OK_VALUE_PUNCTUATION(char c) { … }

}  // namespace

#define ULOC_KEYWORD_BUFFER_LEN …
#define ULOC_MAX_NO_KEYWORDS …

U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char *localeID) { … }

namespace {

/**
 * @param keywordName incoming name to be canonicalized
 * @param status return status (keyword too long)
 * @return the keyword name
 */
CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
{ … }

KeywordStruct;

int32_t U_CALLCONV
compareKeywordStructs(const void * /*context*/, const void *left, const void *right) { … }

}  // namespace

U_EXPORT CharString
ulocimp_getKeywords(const char* localeID,
                    char prev,
                    bool valuesToo,
                    UErrorCode& status)
{ … }

U_EXPORT void
ulocimp_getKeywords(const char* localeID,
                    char prev,
                    ByteSink& sink,
                    bool valuesToo,
                    UErrorCode& status)
{ … }

U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char* localeID,
                     const char* keywordName,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{ … }

U_EXPORT CharString
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        UErrorCode& status)
{ … }

U_EXPORT void
ulocimp_getKeywordValue(const char* localeID,
                        std::string_view keywordName,
                        icu::ByteSink& sink,
                        UErrorCode& status)
{ … }

U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{ … }

U_EXPORT void
ulocimp_setKeywordValue(std::string_view keywordName,
                        std::string_view keywordValue,
                        CharString& localeID,
                        UErrorCode& status)
{ … }

U_EXPORT int32_t
ulocimp_setKeywordValue(std::string_view keywords,
                        std::string_view keywordName,
                        std::string_view keywordValue,
                        ByteSink& sink,
                        UErrorCode& status)
{ … }

/* ### ID parsing implementation **************************************************/

namespace {

inline bool _isPrefixLetter(char a) { … }

/*returns true if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
inline bool _isIDPrefix(const char *s) { … }

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
inline bool _isTerminator(char a) { … }

inline bool _isBCP47Extension(const char* p) { … }

/**
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
 * a nullptr entry, followed by more entries, and a second nullptr entry.
 *
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
 * COUNTRIES_3.
 */
std::optional<int16_t> _findIndex(const char* const* list, const char* key)
{ … }

}  // namespace

U_CFUNC const char*
uloc_getCurrentCountryID(const char* oldID){ … }
U_CFUNC const char*
uloc_getCurrentLanguageID(const char* oldID){ … }

namespace {

/*
 * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
 * avoid duplicating code to handle the earlier locale ID pieces
 * in the functions for the later ones by
 * setting the *pEnd pointer to where they stopped parsing
 *
 * TODO try to use this in Locale
 */

void
_getLanguage(const char* localeID,
             ByteSink* sink,
             const char** pEnd,
             UErrorCode& status) { … }

void
_getScript(const char* localeID,
           ByteSink* sink,
           const char** pEnd) { … }

void
_getRegion(const char* localeID,
           ByteSink* sink,
           const char** pEnd) { … }

/**
 * @param needSeparator if true, then add leading '_' if any variants
 * are added to 'variant'
 */
void
_getVariant(const char* localeID,
            char prev,
            ByteSink* sink,
            const char** pEnd,
            bool needSeparator,
            UErrorCode& status) { … }

}  // namespace

U_EXPORT CharString
ulocimp_getLanguage(const char* localeID, UErrorCode& status) { … }

U_EXPORT CharString
ulocimp_getScript(const char* localeID, UErrorCode& status) { … }

U_EXPORT CharString
ulocimp_getRegion(const char* localeID, UErrorCode& status) { … }

U_EXPORT CharString
ulocimp_getVariant(const char* localeID, UErrorCode& status) { … }

U_EXPORT void
ulocimp_getSubtags(
        const char* localeID,
        CharString* language,
        CharString* script,
        CharString* region,
        CharString* variant,
        const char** pEnd,
        UErrorCode& status) { … }

U_EXPORT void
ulocimp_getSubtags(
        const char* localeID,
        ByteSink* language,
        ByteSink* script,
        ByteSink* region,
        ByteSink* variant,
        const char** pEnd,
        UErrorCode& status) { … }

/* Keyword enumeration */

UKeywordsContext;

U_CDECL_BEGIN

static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration *enumerator) { … }

static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) { … }

static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration* en,
                    int32_t* resultLength,
                    UErrorCode* /*status*/) { … }

static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration* en,
                      UErrorCode* /*status*/) { … }

U_CDECL_END


static const UEnumeration gKeywordsEnum = …;

U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
{ … }

U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
                        UErrorCode* status)
{ … }


/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS …
#define _ULOC_CANONICALIZE …

namespace …  // namespace

/* ### ID parsing API **************************************************/

U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char*    localeID,
               char* parent,
               int32_t parentCapacity,
               UErrorCode* err)
{ … }

U_EXPORT CharString
ulocimp_getParent(const char* localeID,
                  UErrorCode& err)
{ … }

U_EXPORT void
ulocimp_getParent(const char* localeID,
                  icu::ByteSink& sink,
                  UErrorCode& err)
{ … }

U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char*    localeID,
         char* language,
         int32_t languageCapacity,
         UErrorCode* err)
{ … }

U_CAPI int32_t U_EXPORT2
uloc_getScript(const char*    localeID,
         char* script,
         int32_t scriptCapacity,
         UErrorCode* err)
{ … }

U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char* localeID,
            char* country,
            int32_t countryCapacity,
            UErrorCode* err)
{ … }

U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{ … }

U_CAPI int32_t  U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{ … }

U_EXPORT CharString
ulocimp_getName(const char* localeID,
                UErrorCode& err)
{ … }

U_EXPORT void
ulocimp_getName(const char* localeID,
                ByteSink& sink,
                UErrorCode& err)
{ … }

U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{ … }

U_EXPORT CharString
ulocimp_getBaseName(const char* localeID,
                    UErrorCode& err)
{ … }

U_EXPORT void
ulocimp_getBaseName(const char* localeID,
                    ByteSink& sink,
                    UErrorCode& err)
{ … }

U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{ … }

U_EXPORT CharString
ulocimp_canonicalize(const char* localeID,
                     UErrorCode& err)
{ … }

U_EXPORT void
ulocimp_canonicalize(const char* localeID,
                     ByteSink& sink,
                     UErrorCode& err)
{ … }

U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char* localeID)
{ … }

U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char* localeID)
{ … }

U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char* localeID)
{ … }

U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{ … }

/* ### Default locale **************************************************/

U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{ … }

U_CAPI void  U_EXPORT2
uloc_setDefault(const char*   newDefaultLocale,
             UErrorCode* err)
{ … }

/**
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
 * to an array of pointers to arrays of char.  All of these pointers are owned
 * by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{ … }

/**
 * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{ … }

U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char* keyword)
{ … }

U_EXPORT std::optional<std::string_view>
ulocimp_toBcpKeyWithFallback(std::string_view keyword)
{ … }

U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char* keyword, const char* value)
{ … }

U_EXPORT std::optional<std::string_view>
ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
{ … }

namespace {

bool
isWellFormedLegacyKey(std::string_view key)
{ … }

bool
isWellFormedLegacyType(std::string_view legacyType)
{ … }

}  // namespace

U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char* keyword)
{ … }

U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
{ … }

U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char* keyword, const char* value)
{ … }

U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
{ … }

/*eof*/
godot/thirdparty/icu4c/common/uloc.cpp