chromium/components/url_formatter/spoof_checks/skeleton_generator.h

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
#define COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_

#include <map>
#include <memory>
#include <string>
#include <string_view>

#include "base/containers/flat_set.h"
#include "base/memory/raw_ptr.h"
#include "third_party/icu/source/common/unicode/uniset.h"

// Forward declarations of types that are defined outside this header.
struct USpoofChecker;

// The namespace has to be spelled U_ICU_NAMESPACE here; 'icu' does not work.
// NOTE(review): presumably because 'icu' can be a namespace alias, which
// cannot be reopened to add a forward declaration -- confirm against ICU.
namespace U_ICU_NAMESPACE {
class Transliterator;
}  // namespace U_ICU_NAMESPACE

// NOTE(review): the two lines below are bare identifiers and are not valid
// C++ declarations as written. They look like truncated type-alias
// declarations (e.g. `using Skeletons = ...;` and `using SkeletonMap = ...;`);
// the aliased types are not visible here -- confirm against the upstream file
// and restore the full declarations.
Skeletons;
SkeletonMap;

// Produces skeleton strings for hostnames. A skeleton is a transformation of
// the input hostname, and two hostnames are considered confusable when their
// skeletons are identical. See http://unicode.org/reports/tr39/ for details.
//
// A hostname is turned into its skeleton strings in several stages:
// 1. Diacritics are removed to "normalize" the hostname, which allows more
//    confusable hostnames to be detected than the plain ICU API would find.
// 2. A manually curated "multiple skeleton" table, which relates each
//    character to possibly several skeletons (one-to-many), is applied to the
//    normalized hostname to produce supplemental hostname strings. The number
//    of skeletons produced by this stage is capped at a maximum. This stage
//    runs before ICU's own skeleton generation (which is many-to-one) so that
//    more supplemental hostnames can be produced. For example, ICU maps "œ"
//    to "oe"; because "œ" never appears in the ICU skeleton, no supplemental
//    skeletons could be derived from it afterwards, so it has to be mapped to
//    "oe" and "ce" before skeleton generation.
// 3. Each supplemental hostname then goes through the following:
//    a. Certain characters are replaced by their confusable equivalents using
//       a manually curated, many-to-one table (the extra confusable mapper).
//       For example, the characters є, ҽ, ҿ, and ၔ are all mapped to Latin
//       lowercase e.
//    b. The result is handed to ICU, which generates the actual skeleton
//       strings.
//    c. If the skeleton contains the character U+04CF (ӏ), one more skeleton
//       is generated in which it is mapped to lowercase L (U+6C).
// 4. The end result is a Skeletons instance holding one or more skeleton
//    strings that represent the input hostname.
class SkeletonGenerator {
  // No members are declared in this view of the class.
};

#endif  // COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_