// Copyright 2018 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40285824): Remove this and convert code to safer constructs. #pragma allow_unsafe_buffers #endif // This binary generates two C arrays of useful information related to top // domains, which we embed directly into // the final Chrome binary. The input is a list of the top domains. The first // output is named kTopBucketEditDistanceSkeletons, // containing the skeletons of the top bucket domains suitable for use in the // edit distance heuristic. The second output is named kTopKeywords, // containing the top bucket keywords suitable for use with the keyword matching // heuristic (for instance, www.google.com -> google). Both outputs are written // to the same file, which will be formatted as a C++ source file with valid // syntax. // The C-strings in both of the output arrays are guaranteed to be in // lexicographically sorted order. // IMPORTANT: This binary asserts that there are at least enough sites in the // input file to generate 500 skeletons and 500 keywords.
#include <iostream> #include <set> #include <sstream> #include <string> #include <vector> #include "base/command_line.h" #include "base/files/file_util.h" #include "base/i18n/icu_util.h" #include "base/logging.h" #include "base/path_service.h" #include "base/ranges/algorithm.h" #include "base/strings/string_split.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "build/build_config.h" #include "components/url_formatter/spoof_checks/common_words/common_words_util.h" #include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h" #include "third_party/icu/source/common/unicode/unistr.h" #include "third_party/icu/source/common/unicode/utypes.h" #include "third_party/icu/source/i18n/unicode/uspoof.h" namespace { // The size of the top domain array generated in top-bucket-domains-inc.cc. Must // match that in top_bucket_domains.h. If the file has fewer than kMaxDomains // eligible top bucket domains marked (e.g. because some are too short), the // generated array may be padded with blank entries up to kMaxDomains. const size_t kMaxDomains = …; const char* kTopBucketSeparator = …; // Similar to kMaxDomains, but for kTopKeywords. Unlike the top domain array, // this array is a fixed length, and we also output a kNumTopKeywords variable. const size_t kMaxKeywords = …; // The minimum length for a keyword for it to be included. const size_t kMinKeywordLength = …; void PrintHelp() { … } std::string GetSkeleton(const std::string& domain, const USpoofChecker* spoof_checker) { … } bool ContainsOnlyDigits(const std::string& text) { … } } // namespace int main(int argc, char* argv[]) { … }