// chromium/components/lookalikes/core/safety_tips.proto

// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

syntax = "proto2";

option optimize_for = LITE_RUNTIME;

package reputation;

// One entry of SafetyTipsConfig.flagged_page: a page pattern paired with the
// reason it was flagged.
message FlaggedPage {
  // The reason a page is flagged. A pattern that is flagged for several
  // reasons appears as several FlaggedPage entries, one per FlagType (see the
  // comment on SafetyTipsConfig.flagged_page).
  enum FlagType {
    // First value, so it is the proto2 default when |type| is unset.
    UNKNOWN = 0;
    // NOTE(review): presumably "bad reputation" -- confirm against the
    // consumer of this config.
    BAD_REP = 1;
    // NOTE(review): presumably a recently registered domain -- confirm.
    YOUNG_DOMAIN = 2;
  }
  // |pattern| is a full URL, without scheme/username/password/port, such as
  // example.test/test-path-for-safety-tips/test.html.
  optional string pattern = 1;
  // Why |pattern| is flagged.
  optional FlagType type = 2;
}

// A URL pattern entry. Used as the element type of both
// SafetyTipsConfig.allowed_pattern and SafetyTipsConfig.canonical_pattern.
message UrlPattern {
  // |pattern| is a full URL, without scheme/username/password/port, such as
  // example.test/test-path-for-safety-tips/test.html. Also see the comment on
  // the SafetyTipsConfig.allowed_pattern field for the forms a pattern may
  // take.
  optional string pattern = 1;

  // The index of each cohort that this entry is allowed to spoof (presumably
  // an index into SafetyTipsConfig.cohort -- confirm against the consumer).
  // If this field is unset and the pattern is allowlisted, the pattern may
  // spoof any domain. Has no meaning when this message is used in
  // canonical_pattern.
  repeated uint32 cohort_index = 2 [packed = true];
}

// A hostname pattern. Used as the element type of
// SafetyTipsConfig.allowed_target_pattern.
message HostPattern {
  // |regex| is a regular expression that matches allowlisted hostnames.
  // This is different from UrlPattern.pattern, which matches URLs.
  // IMPORTANT: Escape dots that are used as label separators (write
  // "example\.test", not "example.test"); an unescaped dot matches any
  // character.
  optional string regex = 1;
}

// Launch configuration for one new heuristic. Instances are listed in
// SafetyTipsConfig.launch_config, one per heuristic being launched.
message HeuristicLaunchConfig {
  // The heuristic to be launched with a warning UI.
  // Important: Changes to heuristics MUST be added as a NEW heuristic (e.g.
  // by adding a new enum value with "_V2" at the end).
  // Otherwise, rolling out the new version will also enable the buggy version
  // of the heuristic on older versions of Chrome.
  //
  // The values in this enum are intended to be temporary and used only for
  // new heuristic launches. Do not reuse this enum for any other purpose; use
  // the existing LookalikeHeuristic enum instead.
  // NOTE(review): the original comment says LookalikeHeuristic is "in this
  // file", but no such enum is visible here -- confirm where it lives and
  // update this reference.
  enum Heuristic {
    HEURISTIC_UNKNOWN = 0;
    HEURISTIC_CHARACTER_SWAP_ENGAGED_SITES = 1;
    HEURISTIC_CHARACTER_SWAP_TOP_SITES = 2;
    HEURISTIC_COMBO_SQUATTING_TOP_DOMAINS = 3;
    HEURISTIC_COMBO_SQUATTING_ENGAGED_SITES = 4;
  }
  // The heuristic that this launch config applies to.
  optional Heuristic heuristic = 1;

  // Percentage (0-100) of all sites for which this heuristic is enabled on the
  // Stable channel. Which sites are enabled is determined from hash prefixes
  // of the sites.
  // A value of 0 means the heuristic isn't enabled on any site on Stable.
  // A value of 100 means the heuristic is enabled on all sites on Stable.
  //
  // This field only affects Stable: if a launch config is found for a
  // heuristic, the heuristic is enabled for 90% of sites on Canary/Dev and 50%
  // on Beta, regardless of the value of this field.
  optional uint32 launch_percentage = 2;
}

// A set of domains allowed to be spoofed by a given allowlist entry.
// Members are referenced by index rather than embedded: an allowed_pattern or
// canonical_pattern index may appear in multiple Cohorts, and multiple
// allowed_patterns may point to the same Cohort.
message Cohort {
  // Indexes into SafetyTipsConfig.allowed_pattern of the entries that are
  // members of this cohort.
  repeated uint32 allowed_index = 1 [packed = true];

  // Indexes into SafetyTipsConfig.canonical_pattern of the entries that are
  // members of this cohort.
  repeated uint32 canonical_index = 2 [packed = true];
}

// Configuration for the safety tips component. A binary version of this proto
// will be distributed to Chrome clients via component updater. The binary will
// contain a single instance of this message.
message SafetyTipsConfig {
  // Version identifier of this config.
  optional uint32 version_id = 1;

  // List of pages on which to show the Safety Tip UX. This list must be
  // sorted. It may contain duplicate patterns: a pattern flagged with multiple
  // FlagTypes appears once per FlagType.
  repeated FlaggedPage flagged_page = 2;

  // List of patterns that are explicitly allowed. This list must be sorted.
  // Used to mitigate false positives in Safety Tips and Lookalike warnings.
  // - For safety tips, the pattern can be a URL or a full suffix/prefix
  //   expression used for SafeBrowsing. See
  //   https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions.
  // - Lookalike warnings operate on eTLD+1, so it only makes sense for this
  //   to be eTLD+1, such as "google.com/".
  repeated UrlPattern allowed_pattern = 3;

  // Similar to allowed_pattern, but used to allowlist the *targets* of some of
  // the heuristics rather than the lookalike sites themselves.
  // - For edit distance, this is the matched domain. For example, consider edit
  //   distance flagging foo1.com, foo2.com, ... as a spoof of foo.com. If
  //   we are fairly sure that these are all separate and legitimate sites,
  //   allowlisting foo.com is much easier than allowlisting every fooN.com.
  // - For target embedding, this is the embedded target. Some organizations use
  //   lookalike subdomains to proxy popular domains or customize their content
  //   for these popular domains. E.g. google-scholar-com.university.edu.
  //   In these cases it's simpler to allowlist the target instead of the
  //   embedder.
  repeated HostPattern allowed_target_pattern = 4;

  // A *sorted* list of common words. These words are combined with the list at
  // components/url_formatter/spoof_checks/common_words. The combined list is
  // used in some lookalike heuristics to prevent common false positives.
  repeated string common_word = 5;

  // Launch configurations for new heuristics. Each new heuristic being launched
  // gets its own config. Multiple heuristics can be enabled at the same time.
  repeated HeuristicLaunchConfig launch_config = 6;

  // canonical_pattern is a list of hostnames that are only ever spoofed, and do
  // no spoofing of their own. Each entry is pointed to by one or more Cohorts
  // (via Cohort.canonical_index).
  repeated UrlPattern canonical_pattern = 7;

  // A Cohort is a set of domains that may be spoofed by the allowed_pattern
  // entries that point to it (via UrlPattern.cohort_index).
  repeated Cohort cohort = 8;
}