// url_pattern_index.fbs | Explore in Territory

// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// All types declared below are generated into the url_pattern_index.flat
// namespace (e.g. url_pattern_index::flat when generating C++).
namespace url_pattern_index.flat;

// NOTE: Increment url_pattern_index::kUrlPatternIndexFormatVersion whenever
// making a breaking change to this schema. Reordering or removing enum values
// or table fields is a breaking change, because their declaration order
// determines the serialized enum values and field ids.

// The format of the pattern stored in UrlRule.url_pattern. Corresponds to
// url_pattern_index::proto::UrlPatternType. Values are spelled out explicitly
// so that the serialized representation does not depend on declaration order.
enum UrlPatternType : ubyte {
  SUBSTRING = 0,
  WILDCARDED = 1,
  REGEXP = 2,
}

// The anchoring applied to either end of a URL pattern; used by
// UrlRule.anchor_left and UrlRule.anchor_right. Corresponds to
// url_pattern_index::proto::AnchorType. Values are spelled out explicitly so
// that the serialized representation does not depend on declaration order.
enum AnchorType : ubyte {
  NONE = 0,
  BOUNDARY = 1,
  SUBDOMAIN = 2,
}

// URL rule matching options. These correspond to multiple fields of
// url_pattern_index::proto::UrlRule, but here, they are represented as flags
// of the same bitmask to allow for compact storage.
// Declared with bit_flags and no explicit values, so each entry is a single
// bit assigned in declaration order (1, 2, 4, 8). The flags are OR-ed together
// in UrlRule.options.
enum OptionFlag : ubyte (bit_flags) {
  IS_ALLOWLIST,
  APPLIES_TO_FIRST_PARTY,
  APPLIES_TO_THIRD_PARTY,
  IS_CASE_INSENSITIVE,
}

// The options controlling whether or not to activate filtering for subresources
// of documents that match the URL pattern of the rule.
// Corresponds to url_pattern_index::proto::ActivationType.
// Declared with bit_flags, so the values are single bits (DOCUMENT = 1,
// GENERIC_BLOCK = 2) that are OR-ed together in UrlRule.activation_types.
enum ActivationType : ubyte (bit_flags) {
  DOCUMENT,       // Disable all rules on the page.
  GENERIC_BLOCK,  // Disable generic URL rules on the page.
}

// The types of subresource requests that a URL rule should be applied to.
// Declared with bit_flags, so each entry is a single bit assigned in
// declaration order: OTHER is bit 0 and MAIN_FRAME is bit 15. The flags are
// OR-ed together in UrlRule.element_types.
// The enum declares 16 values, i.e. every bit of the underlying ushort is in
// use; adding another value requires widening this type and the
// UrlRule.element_types field.
enum ElementType : ushort (bit_flags) {
  OTHER,
  SCRIPT,
  IMAGE,
  STYLESHEET,
  OBJECT,
  XMLHTTPREQUEST,
  // TODO(crbug.com/40516600): Remove OBJECT_SUBREQUEST type once
  // url_pattern_index no longer has a dependency on proto::UrlRule.
  OBJECT_SUBREQUEST,
  SUBDOCUMENT,
  PING,
  MEDIA,
  FONT,
  WEBSOCKET,
  WEBTRANSPORT,
  WEBBUNDLE,
  CSP_REPORT,
  MAIN_FRAME,
  // Note: Update the default value for |element_types| field in UrlRule, on
  // adding/removing values from this enum.
}

// The request methods that a URL rule should be applied to.
// Declared with bit_flags, so each entry is a single bit assigned in
// declaration order: CONNECT is bit 0 and NON_HTTP is bit 9. The flags are
// OR-ed together in UrlRule.request_methods. With the ten values below, the
// mask with every flag set is 1023 (0x3FF).
enum RequestMethod : ushort (bit_flags) {
  CONNECT,
  DELETE,
  GET,
  HEAD,
  OPTIONS,
  PATCH,
  POST,
  PUT,
  OTHER_HTTP, // Matches HTTP(s) request methods not listed above.
  NON_HTTP,   // Matches non-HTTP(s) requests.
  // Note: Update the default value for `request_methods` field in UrlRule, on
  // adding/removing values from this enum.
}

// The flat representation of a single URL rule. For more details regarding the
// fields please see the comments to url_pattern_index::proto::UrlRule.
//
// Field ids are assigned implicitly from declaration order, so new fields must
// be appended at the end of the table; reordering or removing fields is a
// breaking change (see the format-version note at the top of this file).
// Scalar fields that equal their declared default are not written to the
// buffer, which is why the defaults below are chosen to be the most common
// values.
table UrlRule {
  // Rule matching options, a bitmask consisting of OptionFlags. No default is
  // declared, so an absent field reads as 0 (no flags set).
  options : ubyte;

  // A bitmask of ElementType. Equals ElementType_ANY & ~ElementType_MAIN_FRAME
  // by default for compactness. We expect most rules to not use
  // ElementType_MAIN_FRAME. Keep this in sync with
  // url_pattern_index::kDefaultFlatElementTypesMask.
  // 32767 == 0x7FFF: all 16 ElementType bits except bit 15 (MAIN_FRAME).
  element_types : ushort = 32767;

  // A bitmask of RequestMethod. Equals RequestMethod_ANY by default.
  // 1023 == 0x3FF: all ten RequestMethod bits set.
  request_methods : ushort = 1023;

  // A bitmask of ActivationType. Disables all activation types by default.
  activation_types : ubyte = 0;

  // Use SUBSTRING as default, since it's the most used pattern type. Same as
  // the corresponding proto::UrlRule::url_pattern_type.
  url_pattern_type : UrlPatternType = SUBSTRING;

  // Use NONE as default, since most of the rules are not anchored.
  anchor_left : AnchorType = NONE;
  anchor_right : AnchorType = NONE;

  // The list of initiator and request domains to be included/excluded from the
  // filter's affected set. Each should either be null or have at least a single
  // element. The domains should be in lower-case and kept sorted as defined by
  // url_pattern_index::CompareDomains. The entries must consist of only ascii
  // characters. Use punycode encoding for internationalized domains.
  initiator_domains_included : [string];
  initiator_domains_excluded : [string];
  request_domains_included : [string];
  request_domains_excluded : [string];

  // A URL pattern in the format defined by |url_pattern_type|.  This should
  // only consist of ascii characters, since it's matched against a url where
  // the host is encoded in the punycode format (in case of internationalized
  // domains) and any other non-ascii characters are percent-escaped in utf-8.
  // This should be in lower case if the rule is case-insensitive.
  url_pattern : string;

  // An id which uniquely identifies the rule. Clients must ensure uniqueness if
  // they use this field.
  id : uint;

  // Priority of the rule. The larger the value, the greater the priority.
  // NGramToRules.rule_list and UrlPatternIndex.fallback_rules are sorted in
  // descending order of this field.
  priority : uint64;

  // Binary blob for any embedder specific rule conditions. When specified,
  // these should be interpreted by the url_pattern_index embedder.
  embedder_conditions : [ubyte];
}

// Contains an N-gram (acting as a key in a hash table) and a list of URL rules
// associated with that N-gram. Instances of this table are the slots of
// UrlPatternIndex.ngram_index.
table NGramToRules {
  // A string consisting of N (up to 8) ascii-only non-special characters, which
  // are stored in the lowest N non-zero bytes, lower bytes corresponding to
  // later symbols. These are lower-cased to support case-insensitive matching.
  // For example, following the layout above, the 3-gram "abc" is stored as
  // 0x616263 ('c' occupies the lowest byte).
  ngram : ulong;

  // The list of rules containing |ngram| as a substring of their URL pattern.
  // Sorted in descending order of rule priority.
  rule_list : [UrlRule];
}

// A data structure used to select only a handful of URL rule candidates that
// need to be matched against a certain resource URL. This is the root table of
// the serialized buffer (see the root_type declaration in this file).
table UrlPatternIndex {
  // The N of an N-gram index. Note: |n| should be between 1 and 8.
  n : uint;

  // A hash table with open addressing. The keys of the table are N-grams.
  ngram_index : [NGramToRules];

  // The slot that is pointed to by all empty slots of |ngram_index| hash table.
  // Note: This is a workaround needed because null offsets are not allowed as
  // elements of FlatBuffer arrays.
  ngram_index_empty_slot : NGramToRules;

  // A list storing the rules that don't contain any valid N-grams in their
  // URL patterns. Contains all the REGEXP rules as well. Sorted in descending
  // order of rule priority.
  // TODO(pkalinnikov): Think about better implementation for the fallback
  // index. Possibly make it a hash map and maybe merge it with the N-gram
  // index, since we can treat any sequence of characters shorter than N as an
  // N-gram with zero bytes used for padding.
  fallback_rules : [UrlRule];
}

// Declares UrlPatternIndex as the root table of a serialized buffer.
root_type UrlPatternIndex;
// chromium/components/url_pattern_index/flat/url_pattern_index.fbs