// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This file defines a helper class for selecting a supported language from a
// set of candidates. It is used to get localized strings that are directly
// embedded into the executable / library instead of stored in external
// .pak files.
#include "base/win/embedded_i18n/language_selector.h"
#include <algorithm>
#include <functional>
#include <string_view>
#include "base/check_op.h"
#include "base/memory/raw_ptr.h"
#include "base/ranges/algorithm.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/win/i18n.h"
namespace base {
namespace win {
namespace i18n {
namespace {
using LangToOffset = LanguageSelector::LangToOffset;
// Holds pointers to LangToOffset pairs for specific languages that are the
// targets of exceptions (where one language is mapped to another) or wildcards
// (where a raw language identifier is mapped to a specific localization).
// DetermineAvailableAliases fills in one member per language it finds in the
// table; the lookup helpers test each member before using it and treat a
// member that was never assigned as "that translation is not available".
struct AvailableLanguageAliases {
// "en-gb": alias target for en-au, en-ca, en-nz and en-za.
raw_ptr<const LangToOffset> en_gb_language_offset;
// "en-us": wildcard target for neutral "en", and the final fallback when no
// candidate matches at all.
raw_ptr<const LangToOffset> en_us_language_offset;
// "es": alias target for es-es.
raw_ptr<const LangToOffset> es_language_offset;
// "es-419": wildcard target for neutral "es".
raw_ptr<const LangToOffset> es_419_language_offset;
// "fil": alias target for tl.
raw_ptr<const LangToOffset> fil_language_offset;
// "iw": alias target for he.
raw_ptr<const LangToOffset> iw_language_offset;
// "no": alias target for nb.
raw_ptr<const LangToOffset> no_language_offset;
// "pt-br": wildcard target for neutral "pt".
raw_ptr<const LangToOffset> pt_br_language_offset;
// "zh-cn": alias target for zh-chs, zh-hans and zh-sg; wildcard target for
// neutral "zh".
raw_ptr<const LangToOffset> zh_cn_language_offset;
// "zh-tw": alias target for zh-cht, zh-hant, zh-hk and zh-mo.
raw_ptr<const LangToOffset> zh_tw_language_offset;
};
#if DCHECK_IS_ON()
// Debug-only validation of the language table: returns true when the entries
// of |languages_to_offset| are in ascending order and every language name is
// unchanged by ASCII lower-casing.
bool IsArraySortedAndLowerCased(span<const LangToOffset> languages_to_offset) {
  if (!std::is_sorted(languages_to_offset.begin(),
                      languages_to_offset.end())) {
    return false;
  }
  for (const LangToOffset& entry : languages_to_offset) {
    const auto name = AsStringPiece16(entry.first);
    // A name that differs from its lower-cased form contains upper case ASCII.
    if (ToLowerASCII(name) != name) {
      return false;
    }
  }
  return true;
}
#endif  // DCHECK_IS_ON()
// Scans |languages_to_offset| once and records, for every language that can be
// the target of an alias (GetAliasedLanguageOffset) or of a neutral-language
// wildcard (GetCompatibleNeutralLanguageOffset), a pointer to its table entry.
// Targets that do not appear in the table keep their value-initialized state.
// The en-us entry is required because it serves as the fallback language.
AvailableLanguageAliases DetermineAvailableAliases(
    span<const LangToOffset> languages_to_offset) {
  // Identifies which member of AvailableLanguageAliases a language name fills.
  using AliasSlot = decltype(&AvailableLanguageAliases::en_us_language_offset);
  struct AliasTarget {
    const wchar_t* name;
    AliasSlot slot;
  };
  static constexpr AliasTarget kAliasTargets[] = {
      {L"en-gb", &AvailableLanguageAliases::en_gb_language_offset},
      {L"en-us", &AvailableLanguageAliases::en_us_language_offset},
      {L"es", &AvailableLanguageAliases::es_language_offset},
      {L"es-419", &AvailableLanguageAliases::es_419_language_offset},
      {L"fil", &AvailableLanguageAliases::fil_language_offset},
      {L"iw", &AvailableLanguageAliases::iw_language_offset},
      {L"no", &AvailableLanguageAliases::no_language_offset},
      {L"pt-br", &AvailableLanguageAliases::pt_br_language_offset},
      {L"zh-cn", &AvailableLanguageAliases::zh_cn_language_offset},
      {L"zh-tw", &AvailableLanguageAliases::zh_tw_language_offset},
  };

  AvailableLanguageAliases found = {};
  for (const LangToOffset& entry : languages_to_offset) {
    for (const AliasTarget& target : kAliasTargets) {
      if (entry.first == target.name) {
        found.*(target.slot) = &entry;
        // Names in the table are distinct, so no other slot can match.
        break;
      }
    }
  }

  // Fallback language must exist.
  DCHECK(found.en_us_language_offset);
  return found;
}
// Looks up |language| in |languages_to_offset| by exact name. On success,
// stores a pointer to the matching entry in |matched_language_to_offset| and
// returns true; otherwise returns false and leaves the output untouched.
// |languages_to_offset| must be sorted by language and all languages must be
// lower case, since the lookup is a binary search on the language name.
bool GetExactLanguageOffset(span<const LangToOffset> languages_to_offset,
                            const std::wstring& language,
                            const LangToOffset** matched_language_to_offset) {
  DCHECK(matched_language_to_offset);

  // Orders a table entry against the name being searched for.
  const auto entry_precedes = [](const LangToOffset& entry,
                                 const std::wstring& name) {
    return entry.first < name;
  };

  const auto table_end = languages_to_offset.end();
  const auto position = std::lower_bound(languages_to_offset.begin(),
                                         table_end, language, entry_precedes);

  // lower_bound yields the first entry not less than |language|; it is a hit
  // only if such an entry exists and its name is equal.
  if (position == table_end || position->first != language) {
    return false;
  }

  *matched_language_to_offset = &*position;
  return true;
}
// Returns true if the current language can be aliased to another language.
bool GetAliasedLanguageOffset(const AvailableLanguageAliases& available_aliases,
const std::wstring& language,
const LangToOffset** matched_language_to_offset) {
DCHECK(matched_language_to_offset);
// Alias some English variants to British English (all others wildcard to
// US).
if (available_aliases.en_gb_language_offset &&
(language == L"en-au" || language == L"en-ca" || language == L"en-nz" ||
language == L"en-za")) {
*matched_language_to_offset = available_aliases.en_gb_language_offset;
return true;
}
// Alias es-es to es (all others wildcard to es-419).
if (available_aliases.es_language_offset && language == L"es-es") {
*matched_language_to_offset = available_aliases.es_language_offset;
return true;
}
// Google web properties use iw for he. Handle both just to be safe.
if (available_aliases.iw_language_offset && language == L"he") {
*matched_language_to_offset = available_aliases.iw_language_offset;
return true;
}
// Google web properties use no for nb. Handle both just to be safe.
if (available_aliases.no_language_offset && language == L"nb") {
*matched_language_to_offset = available_aliases.no_language_offset;
return true;
}
// Some Google web properties use tl for fil. Handle both just to be safe.
// They're not completely identical, but alias it here.
if (available_aliases.fil_language_offset && language == L"tl") {
*matched_language_to_offset = available_aliases.fil_language_offset;
return true;
}
if (available_aliases.zh_cn_language_offset &&
// Pre-Vista alias for Chinese w/ script subtag.
(language == L"zh-chs" ||
// Vista+ alias for Chinese w/ script subtag.
language == L"zh-hans" ||
// Although the wildcard entry for zh would result in this, alias zh-sg
// so that it will win if it precedes another valid tag in a list of
// candidates.
language == L"zh-sg")) {
*matched_language_to_offset = available_aliases.zh_cn_language_offset;
return true;
}
if (available_aliases.zh_tw_language_offset &&
// Pre-Vista alias for Chinese w/ script subtag.
(language == L"zh-cht" ||
// Vista+ alias for Chinese w/ script subtag.
language == L"zh-hant" ||
// Alias Hong Kong and Macau to Taiwan.
language == L"zh-hk" || language == L"zh-mo")) {
*matched_language_to_offset = available_aliases.zh_tw_language_offset;
return true;
}
return false;
}
// Returns true if the current neutral language can be aliased to another
// language.
bool GetCompatibleNeutralLanguageOffset(
const AvailableLanguageAliases& available_aliases,
const std::wstring& neutral_language,
const LangToOffset** matched_language_to_offset) {
DCHECK(matched_language_to_offset);
if (available_aliases.en_us_language_offset && neutral_language == L"en") {
// Use the U.S. region for anything English.
*matched_language_to_offset = available_aliases.en_us_language_offset;
return true;
}
if (available_aliases.es_419_language_offset && neutral_language == L"es") {
// Use the Latin American region for anything Spanish.
*matched_language_to_offset = available_aliases.es_419_language_offset;
return true;
}
if (available_aliases.pt_br_language_offset && neutral_language == L"pt") {
// Use the Brazil region for anything Portugese.
*matched_language_to_offset = available_aliases.pt_br_language_offset;
return true;
}
if (available_aliases.zh_cn_language_offset && neutral_language == L"zh") {
// Use the P.R.C. region for anything Chinese.
*matched_language_to_offset = available_aliases.zh_cn_language_offset;
return true;
}
return false;
}
// Runs through |candidates| in order, lower-casing each one (ASCII) and
// trying to match it against the available translations. Two passes are made:
//   1. exact match in |languages_to_offset|, or alias match;
//   2. wildcard match on the part of the candidate before the first '-'.
// The second pass only runs if no candidate matched in the first one. Returns
// true on a match, in which case |matched_name| receives the candidate as
// originally spelled and |matched_language_to_offset| receives the entry of
// the selected translation. Returns false if nothing matched.
bool SelectIf(const std::vector<std::wstring>& candidates,
              span<const LangToOffset> languages_to_offset,
              const AvailableLanguageAliases& available_aliases,
              const LangToOffset** matched_language_to_offset,
              std::wstring* matched_name) {
  DCHECK(matched_language_to_offset);
  DCHECK(matched_name);

  const auto to_lower = [](const std::wstring& text) {
    return AsWString(ToLowerASCII(AsStringPiece16(text)));
  };

  // Note: always perform the exact match first so that an alias is never
  // selected in place of a future translation.
  // An earlier candidate entry matching on an exact match or alias match takes
  // precedence over a later candidate entry matching on an exact match.
  for (const std::wstring& candidate : candidates) {
    const std::wstring lowered = to_lower(candidate);
    const bool is_match =
        GetExactLanguageOffset(languages_to_offset, lowered,
                               matched_language_to_offset) ||
        GetAliasedLanguageOffset(available_aliases, lowered,
                                 matched_language_to_offset);
    if (!is_match) {
      continue;
    }
    *matched_name = candidate;
    return true;
  }

  // If no candidate matches exactly or by alias, try to match by locale neutral
  // language.
  for (const std::wstring& candidate : candidates) {
    const std::wstring lowered = to_lower(candidate);
    // Keep only what precedes the first '-'. With no '-', find() returns npos
    // and substr() yields the whole string.
    const std::wstring neutral = lowered.substr(0, lowered.find(L'-'));
    if (!GetCompatibleNeutralLanguageOffset(available_aliases, neutral,
                                            matched_language_to_offset)) {
      continue;
    }
    *matched_name = candidate;
    return true;
  }

  return false;
}
void SelectLanguageMatchingCandidate(
const std::vector<std::wstring>& candidates,
span<const LangToOffset> languages_to_offset,
size_t* selected_offset,
std::wstring* matched_candidate,
std::wstring* selected_language) {
DCHECK(selected_offset);
DCHECK(matched_candidate);
DCHECK(selected_language);
DCHECK(!languages_to_offset.empty());
DCHECK_EQ(static_cast<size_t>(*selected_offset), languages_to_offset.size());
DCHECK(matched_candidate->empty());
DCHECK(selected_language->empty());
// Note: While DCHECK_IS_ON() seems redundant here, this is required to avoid
// compilation errors, since IsArraySortedAndLowerCased is not defined
// otherwise.
#if DCHECK_IS_ON()
DCHECK(IsArraySortedAndLowerCased(languages_to_offset))
<< "languages_to_offset is not sorted and lower cased";
#endif // DCHECK_IS_ON()
// Get which languages that are commonly used as aliases and wildcards are
// available for use to match candidates.
AvailableLanguageAliases available_aliases =
DetermineAvailableAliases(languages_to_offset);
// The fallback must exist.
DCHECK(available_aliases.en_us_language_offset);
// Try to find the first matching candidate from all the language mappings
// that are given. Failing that, used en-us as the fallback language.
const LangToOffset* matched_language_to_offset = nullptr;
if (!SelectIf(candidates, languages_to_offset, available_aliases,
&matched_language_to_offset, matched_candidate)) {
matched_language_to_offset = available_aliases.en_us_language_offset;
*matched_candidate =
std::wstring(available_aliases.en_us_language_offset->first);
}
DCHECK(matched_language_to_offset);
// Get the real language being used for the matched candidate.
*selected_language = std::wstring(matched_language_to_offset->first);
*selected_offset = matched_language_to_offset->second;
}
// Builds the ordered list of language candidates to try. A non-empty
// |preferred_language| comes first; the list is then handed to
// GetThreadPreferredUILanguageList so it can add the thread's preferred UI
// languages. That call's return value is not inspected here.
std::vector<std::wstring> GetCandidatesFromSystem(
    std::wstring_view preferred_language) {
  std::vector<std::wstring> language_candidates;

  // Seed the list with the caller's preference, when one was supplied.
  const bool has_preference = !preferred_language.empty();
  if (has_preference) {
    language_candidates.emplace_back(preferred_language);
  }

  // Now try the UI languages. Use the thread preferred ones since that will
  // kindly return us a list of all kinds of fallbacks.
  win::i18n::GetThreadPreferredUILanguageList(&language_candidates);

  return language_candidates;
}
} // namespace
// Selects a language using candidates gathered by GetCandidatesFromSystem:
// |preferred_language| (when non-empty) followed by whatever that helper
// obtains for the current thread. Delegates the actual selection to the
// candidate-list constructor.
LanguageSelector::LanguageSelector(std::wstring_view preferred_language,
span<const LangToOffset> languages_to_offset)
: LanguageSelector(GetCandidatesFromSystem(preferred_language),
languages_to_offset) {}
// Selects a language for the explicit list of |candidates| (tried in order)
// out of |languages_to_offset|. |selected_offset_| starts out equal to the
// table size because SelectLanguageMatchingCandidate DCHECKs that starting
// value before overwriting it with the offset of the selected language; it
// also fills in |matched_candidate_| and |selected_language_|.
LanguageSelector::LanguageSelector(const std::vector<std::wstring>& candidates,
span<const LangToOffset> languages_to_offset)
: selected_offset_(languages_to_offset.size()) {
SelectLanguageMatchingCandidate(candidates, languages_to_offset,
&selected_offset_, &matched_candidate_,
&selected_language_);
}
// Defaulted out of line; no cleanup beyond member destruction is written here.
LanguageSelector::~LanguageSelector() = default;
} // namespace i18n
} // namespace win
} // namespace base