lookalike_url_util.cc | Explore in Territory

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "components/lookalikes/core/lookalike_url_util.h"

#include <algorithm>
#include <string_view>
#include <utility>

#include "base/containers/contains.h"
#include "base/functional/callback.h"
#include "base/hash/sha1.h"
#include "base/i18n/char_iterator.h"
#include "base/metrics/histogram_macros.h"
#include "base/strings/strcat.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/trace_event.h"
#include "base/values.h"
#include "build/build_config.h"
#include "components/lookalikes/core/safety_tips_config.h"
#include "components/security_interstitials/core/pref_names.h"
#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
#include "components/url_formatter/spoof_checks/top_domains/top_bucket_domains.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/url_util.h"

ComboSquattingParams;
DomainInfo;
GetDomainInfo;
HasOneCharacterSwap;
IsEditDistanceAtMostOne;
LookalikeTargetAllowlistChecker;
LookalikeUrlMatchType;
NavigationSuggestionEvent;
TopBucketDomainsParams;

namespace {

// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
const char kDigitChars[] = …;

// Minimum length of e2LD protected against target embedding. For example,
// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since
// "baz" is shorter than kMinE2LDLengthForTargetEmbedding.
const size_t kMinE2LDLengthForTargetEmbedding = …;

// We might not protect a domain whose e2LD is a common word in target embedding
// based on the TLD that is paired with it. This list supplements words from
// url_formatter::common_words::IsCommonWord().
const char* kLocalAdditionalCommonWords[] = …;

// These domains are plausible lookalike targets, but they also use common words
// in their names. Selectively prevent flagging embeddings where the embedder
// ends in "-DOMAIN.TLD", since these tend to have higher false positive rates.
const char* kDomainsPermittedInEndEmbeddings[] = …;

// What separators can be used to separate tokens in target embedding spoofs?
// e.g. www-google.com.example.com uses "-" (www-google) and "." (google.com).
const char kTargetEmbeddingSeparators[] = …;

// A small subset of private registries on the PSL that act like public
// registries AND are a common source of false positives in lookalike checks. We
// treat them as public for the purposes of lookalike checks.
const char* kPrivateRegistriesTreatedAsPublic[] = …;

TopBucketDomainsParams* GetTopDomainParams() { … }

// Minimum length of the eTLD+1 without registry needed to show the punycode
// interstitial. IDNs whose eTLD+1 without registry is shorter than this are
// still displayed in punycode, but don't show an interstitial.
const size_t kMinimumE2LDLengthToShowPunycodeInterstitial = …;

// Default launch percentage of a new heuristic on Canary/Dev and Beta. These
// are used if there is a launch config for the heuristic in the proto.
const int kDefaultLaunchPercentageOnCanaryDev = …;
const int kDefaultLaunchPercentageOnBeta = …;

// Define skeletons of brand names and popular keywords for use in the Combo
// Squatting heuristic. These lists are manually curated using Chrome metrics.
// We will check combinations of brand names and popular keywords,
// e.g. google-login.com or youtubesecure.com.
// For every brand name, brand_name[.]com should be checked to be valid. If
// no matched domain is found in top domains, brand_name[.]com will be
// suggested to the user for navigation.
// If brand_name[.]com is not valid for any brand name, each brand name should
// be mapped to a valid url manually and the data structure of
// kBrandNamesForCSQ should be changed accordingly.
// In each element of kBrandNamesForCSQ, first string is an original brand name
// and second string is its skeleton.
// If you are adding a brand name here, you can generate its skeleton using the
// format_url binary (components/url_formatter/tools/format_url.cc)
// TODO(crbug.com/40855941): Generate skeletons of hard coded brand names in
// Chrome initialization and remove manual adding of skeletons to this list.
constexpr std::pair<const char*, const char*> kBrandNamesForCSQ[] = …;

// Each element in kSkeletonsOfPopularKeywordsForCSQ is a skeleton of a popular
// keyword. In contrast to kBrandNamesForCSQ, the original keywords are not
// included. Because in kBrandNamesForCSQ, original brand names are used to
// generate the matched domain, and original keywords are not needed for that
// process.
// If you are adding a keyword here, you can generate its skeleton
// using the format_url binary (components/url_formatter/tools/format_url.cc)
const char* kSkeletonsOfPopularKeywordsForCSQ[] = …;

// Minimum length of brand to be checked for Combo Squatting.
const size_t kMinBrandNameLengthForComboSquatting = …;

ComboSquattingParams* GetComboSquattingParams() { … }

bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
                    const url_formatter::Skeletons& skeletons2) { … }

// Returns a site that the user has used before that the eTLD+1 of
// |navigated_domain| may be attempting to spoof, based on skeleton
// comparison.
std::string GetMatchingSiteEngagementDomain(
    const std::vector<DomainInfo>& engaged_sites,
    const DomainInfo& navigated_domain) { … }

// Scans the top sites list and returns true if it finds a domain with an edit
// distance or character swap of one to |navigated_domain|. This search is
// done in lexicographic order on the top 500 suitable domains, instead of in
// order by popularity. This means that the resulting "similar" domain may not
// be the most popular domain that matches.
bool GetSimilarDomainFromTopBucket(
    const DomainInfo& navigated_domain,
    const LookalikeTargetAllowlistChecker& target_allowlisted,
    std::string* matched_domain,
    LookalikeUrlMatchType* match_type) { … }

// Scans the engaged site list for edit distance and character swap matches.
// Returns true if there is a match and fills |matched_domain| with the first
// matching engaged domain and |match_type| with the matching heuristic type.
bool GetSimilarDomainFromEngagedSites(
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& target_allowlisted,
    std::string* matched_domain,
    LookalikeUrlMatchType* match_type) { … }

void RecordEvent(NavigationSuggestionEvent event) { … }

// Returns the parts of the domain that are separated by "." or "-", not
// including the eTLD.
//
// |hostname| must outlive the return value since the vector contains
// std::string_views.
std::vector<std::string_view> SplitDomainIntoTokens(
    const std::string& hostname) { … }

// Returns whether any subdomain ending in the last entry of |domain_labels| is
// allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the
// allowlist for google.com, scholar.google.com, and foo.scholar.google.com.
bool ASubdomainIsAllowlisted(
    const base::span<const std::string_view>& domain_labels,
    const LookalikeTargetAllowlistChecker& in_target_allowlist) { … }

// Returns the top domain if the top domain without its separators matches the
// |potential_target| (e.g. googlecom). The matching is a skeleton matching.
std::string GetMatchingTopDomainWithoutSeparators(
    std::string_view potential_target) { … }

// Returns whether the visited domain is either for a bare eTLD+1 (e.g.
// 'google.com') or a trivial subdomain (e.g. 'www.google.com').
bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) { … }

// Returns if the eTLD+1 of |domain| shares the skeleton of an eTLD+1 with an
// engaged site or a top bucket domain. |embedded_target| is set to the
// matching eTLD+1.
bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
    const DomainInfo& domain,
    const std::vector<DomainInfo>& engaged_sites,
    std::string* embedded_target) { … }

// Returns whether the e2LD of the provided domain is a common word (e.g.
// weather.com, ask.com). Target embeddings of these domains are often false
// positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto,
                    const DomainInfo& domain) { … }

// Returns whether |domain_labels| is in the same domain as |embedding_domain|.
// e.g. IsEmbeddingItself(["foo", "example", "com"], "example.com") -> true
//  since foo.example.com is in the same domain as example.com.
bool IsEmbeddingItself(const base::span<const std::string_view>& domain_labels,
                       const std::string& embedding_domain) { … }

// Identical to url_formatter::top_domains::HostnameWithoutRegistry(), but
// respects de-facto public registries like .com.de using similar logic to
// GetETLDPlusOne. See kPrivateRegistriesTreatedAsPublic definition for more
// details. e.g. "google.com.de" returns "google". Call with an eTLD+1, not a
// full hostname.
std::string GetE2LDWithDeFactoPublicRegistries(
    const std::string& domain_and_registry) { … }

// Returns whether |embedded_target| and |embedding_domain| share the same e2LD,
// (as in, e.g., google.com and google.org, or airbnb.com.br and airbnb.com).
// Assumes |embedding_domain| is an eTLD+1. Respects de-facto public eTLDs.
bool IsCrossTLDMatch(const DomainInfo& embedded_target,
                     const std::string& embedding_domain) { … }

// Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings
// and that |embedding_domain| ends with that domain, e.g. "evil-office.com" is
// permitted, as "office.com" is in kDomainsPermittedInEndEmbeddings.  Only
// impacts Target Embedding matches.
bool EndsWithPermittedDomains(const DomainInfo& embedded_target,
                              const std::string& embedding_domain) { … }

// A domain is allowed to be embedded if it is embedding itself, if its e2LD is
// a common word, any valid partial subdomain is allowlisted, or if it's a
// cross-TLD match (e.g. google.com vs google.com.mx).
bool IsAllowedToBeEmbedded(
    const DomainInfo& embedded_target,
    const base::span<const std::string_view>& subdomain_span,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    const std::string& embedding_domain,
    const reputation::SafetyTipsConfig* config_proto) { … }

// Returns the first character of the first string that is different from the
// second string. Strings should be at least 1 edit distance apart.
char GetFirstDifferentChar(const std::string& str1, const std::string& str2) { … }

// Brand names with length of 4 or less should not be checked in domains for
// Combo Squatting. Short brand names can cause false positives in results.
bool IsComboSquattingCandidate(const std::string& brand) { … }

// Extract brand names from engaged sites to be checked for Combo Squatting, if
// the brand is not one of the hard coded brand names.
std::vector<std::pair<std::string, std::string>> GetBrandNamesFromEngagedSites(
    const std::vector<DomainInfo>& engaged_sites) { … }

// Registry of the navigated domain is needed to find matched_domain
// in Combo Squatting domains. For example, registry of
// `google-login[.]co[.]br` is `co[.]br`.
std::string GetRegistry(const DomainInfo& navigated_domain) { … }

// If a matched domain including the brand name and TLD of the navigated
// domain is found in top domains, that top domain is returned. Otherwise,
// brand_name[.]com is returned. Hard coded brand names should be checked to
// have a valid brand_name[.]com url.
std::string FindMatchedDomainForHardCodedComboSquatting(
    const std::string& brand_name,
    const DomainInfo& navigated_domain) { … }

// Engaged sites are sorted based on engagement score, so the returned matched
// domain is the first domain in the engaged sites list that includes the
// brand name of the navigated domain.
std::string FindMatchedDomainForSiteEngagementComboSquatting(
    const std::string& brand_name,
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites) { … }

// Returns true if the navigated_domain is flagged as Combo Squatting.
// matched_domain is the suggested domain that will be shown to the user
// instead of the navigated_domain in the warning UI.
bool IsComboSquatting(
    const std::vector<std::pair<std::string, std::string>>& brand_names,
    const ComboSquattingParams& combo_squatting_params,
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    std::string* matched_domain,
    bool is_hard_coded) { … }

}  // namespace

namespace lookalikes {

const char kInterstitialHistogramName[] = …;

void RegisterProfilePrefs(user_prefs::PrefRegistrySyncable* registry) { … }

std::string GetConsoleMessage(const GURL& lookalike_url,
                              bool is_new_heuristic) { … }

DomainInfo::DomainInfo(
    const std::string& arg_hostname,
    const std::string& arg_domain_and_registry,
    const std::string& arg_domain_without_registry,
    const url_formatter::IDNConversionResult& arg_idn_result,
    const url_formatter::Skeletons& arg_skeletons,
    const url_formatter::Skeletons& arg_domain_without_registry_skeletons)
    : … { … }

DomainInfo::~DomainInfo() = default;

DomainInfo::DomainInfo(const DomainInfo&) = default;

DomainInfo GetDomainInfo(const std::string& hostname) { … }

DomainInfo GetDomainInfo(const GURL& url) { … }

std::string GetETLDPlusOne(const std::string& hostname) { … }

bool IsEditDistanceAtMostOne(const std::u16string& str1,
                             const std::u16string& str2) { … }

bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
                                       const DomainInfo& matched_domain) { … }

bool IsLikelyCharacterSwapFalsePositive(const DomainInfo& navigated_domain,
                                        const DomainInfo& matched_domain) { … }

bool IsTopDomain(const DomainInfo& domain_info) { … }

bool GetMatchingDomain(
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    const reputation::SafetyTipsConfig* config_proto,
    std::string* matched_domain,
    LookalikeUrlMatchType* match_type) { … }

void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) { … }

TargetEmbeddingType GetTargetEmbeddingType(
    const std::string& hostname,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    const reputation::SafetyTipsConfig* config_proto,
    std::string* safe_hostname) { … }

TargetEmbeddingType SearchForEmbeddings(
    const std::string& hostname,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    const reputation::SafetyTipsConfig* config_proto,
    bool safety_tips_allowed,
    std::string* safe_hostname) { … }

bool IsASCII(UChar32 codepoint) { … }

// Returns true if |codepoint| has emoji related properties.
bool IsEmojiRelatedCodepoint(UChar32 codepoint) { … }

// Returns true if |text| contains only ASCII characters, pictographs
// or emojis. This check is only used to determine if a domain that already
// failed spoof checks should be blocked by an interstitial. Ideally, we would
// check this for non-ASCII scripts as well (e.g. Cyrillic + emoji), but such
// usage isn't common.
bool IsASCIIAndEmojiOnly(std::u16string_view text) { … }

// Returns true if the e2LD of domain is long enough to display a punycode
// interstitial.
bool IsPunycodeInterstitialCandidate(const DomainInfo& domain) { … }

bool ShouldBlockBySpoofCheckResult(const DomainInfo& navigated_domain) { … }

bool IsAllowedByEnterprisePolicy(const PrefService* pref_service,
                                 const GURL& url) { … }

void SetEnterpriseAllowlistForTesting(PrefService* pref_service,
                                      const std::vector<std::string>& hosts) { … }

bool HasOneCharacterSwap(const std::u16string& str1,
                         const std::u16string& str2) { … }

void SetTopBucketDomainsParamsForTesting(const TopBucketDomainsParams& params) { … }

void ResetTopBucketDomainsParamsForTesting() { … }

bool IsHeuristicEnabledForHostname(
    const reputation::SafetyTipsConfig* config_proto,
    const reputation::HeuristicLaunchConfig::Heuristic heuristic,
    const std::string& lookalike_etld_plus_one,
    version_info::Channel channel) { … }

void SetComboSquattingParamsForTesting(const ComboSquattingParams& params) { … }

void ResetComboSquattingParamsForTesting() { … }

ComboSquattingType GetComboSquattingType(
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    std::string* matched_domain) { … }

bool IsSafeTLD(const std::string& hostname) { … }

LookalikeActionType GetActionForMatchType(
    const reputation::SafetyTipsConfig* config,
    version_info::Channel channel,
    const std::string& etld_plus_one,
    LookalikeUrlMatchType match_type) { … }

GURL GetSuggestedURL(LookalikeUrlMatchType match_type,
                     const GURL& navigated_url,
                     const std::string& matched_hostname) { … }

}  // namespace lookalikes
chromium/components/lookalikes/core/lookalike_url_util.cc