chromium/components/url_formatter/spoof_checks/top_domains/make_top_domain_list_variables.cc

// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

// This binary generates two C arrays of useful information related to top
// domains, which we embed directly into the final Chrome binary. The input is a
// list of the top domains. The first output is named
// kTopBucketEditDistanceSkeletons, containing the skeletons of the top bucket
// domains suitable for use in the edit distance heuristic. The second output is
// named kTopKeywords, containing the top bucket keywords suitable for use with
// the keyword matching heuristic (for instance, www.google.com -> google). Both
// outputs are written to the same file, which will be formatted as a C++ source
// file with valid syntax.

// The C-strings in both of the output arrays are guaranteed to be in
// lexicographically sorted order.

// IMPORTANT: This binary asserts that there are at least enough sites in the
// input file to generate 500 skeletons and 500 keywords.

#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/ranges/algorithm.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"

namespace {

// The size of the top domain array generated in top-bucket-domains-inc.cc. Must
// match that in top_bucket_domains.h. If the file has fewer than kMaxDomains
// eligible top bucket domains marked (e.g. because some are too short), the
// generated array may be padded with blank entries up to kMaxDomains.
// NOTE(review): the initializer after `=` is missing in this view, so this
// declaration does not compile as written; restore the value that matches
// top_bucket_domains.h.
const size_t kMaxDomains =;
// NOTE(review): presumably the marker string that delimits the top bucket
// section of the input list -- confirm against its use in main(). The
// initializer after `=` is missing in this view as well.
const char* kTopBucketSeparator =;

// Similar to kMaxDomains, but for kTopKeywords. Unlike the top domain array,
// this array is a fixed length, and we also output a kNumTopKeywords variable.
// NOTE(review): the initializer after `=` is missing in this view; the value
// cannot be determined from this file alone.
const size_t kMaxKeywords =;

// The minimum length for a keyword for it to be included.
// NOTE(review): the initializer after `=` is missing in this view; the value
// cannot be determined from this file alone.
const size_t kMinKeywordLength =;

// Presumably meant to print usage information for this binary -- confirm
// against the full implementation.
// NOTE(review): the body is empty in this view, so calling this function has
// no effect.
void PrintHelp() {}

// Intended to return the skeleton of `domain`, computed with `spoof_checker`
// (presumably through ICU's uspoof skeleton API -- confirm against the full
// implementation).
// NOTE(review): the body is empty in this view, so no std::string is ever
// returned; flowing off the end of a value-returning function is undefined
// behavior. The implementation must be restored before this is called.
std::string GetSkeleton(const std::string& domain,
                        const USpoofChecker* spoof_checker) {}

// Returns true if every character of `text` is an ASCII decimal digit
// ('0'-'9'), and false as soon as any other character is found. An empty
// string contains no non-digit characters, so it yields true.
//
// The comparison is done on explicit character ranges rather than through
// ::isdigit so that the result does not depend on the current C locale and no
// negative `char` value is ever passed to a <cctype> function.
bool ContainsOnlyDigits(const std::string& text) {
  for (const char c : text) {
    if (c < '0' || c > '9') {
      return false;
    }
  }
  return true;
}

}  // namespace

// Program entry point; receives the command-line argument count and vector.
// NOTE(review): the body is empty in this view, so none of the generation
// steps described in the file header comment are visible here, and `argc` /
// `argv` are unused. An empty main() implicitly returns 0.
int main(int argc, char* argv[]) {}