// Copyright 2020 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_ #define COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_ #include <map> #include <memory> #include <string> #include <string_view> #include "base/containers/flat_set.h" #include "base/memory/raw_ptr.h" #include "third_party/icu/source/common/unicode/uniset.h" // 'icu' does not work. Use U_ICU_NAMESPACE. namespace U_ICU_NAMESPACE { class Transliterator; } // namespace U_ICU_NAMESPACE struct USpoofChecker; Skeletons; SkeletonMap; // This class generates skeleton strings from hostnames. Skeletons are a // transformation of the input hostname. Two hostnames are confusable if their // skeletons are identical. See http://unicode.org/reports/tr39/ for more // information. // // Transformation of a hostname to its skeleton strings happens in multiple // steps: // 1. The hostname is "normalized" by removing its diacritics. This is done so // that more confusable hostnames can be detected than would be possible using the // plain ICU API. // 2. Supplemental hostname strings are generated from the normalized hostname // using a manually curated "multiple skeleton" table. This table has a // one-to-many relationship between characters and their skeletons. The // number of skeletons generated by this step is capped to a maximum number. // This step is done before ICU's skeleton generation (which is many-to-one) // so that we can generate more supplemental hostnames. For example, ICU // maps "œ" to "oe". Since the character "œ" won't appear in the ICU // skeleton, we can't produce supplemental skeletons for it. Therefore, we // must map it to "oe" and "ce" before skeleton generation. // 3. For each supplemental hostname, the following steps are performed: // 4. 
Certain characters in the hostname are mapped to their confusable // equivalents using a manually curated table (extra confusable mapper). This // table has a many-to-one relationship between characters and their // skeletons. For example, the characters є, ҽ, ҿ, and ၔ are all // mapped to Latin lowercase e. // 5. The hostname is passed to ICU to generate actual skeleton strings. // 6. If the character U+04CF (ӏ) is present in the skeleton, another skeleton // is generated by mapping it to lowercase L (U+006C). // 7. The final output is a Skeletons instance which contains one or more // skeleton strings that represent the input hostname. class SkeletonGenerator { … }; #endif // COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_