session_inclusion_rules.cc | Explore in Territory

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/device_bound_sessions/session_inclusion_rules.h"

#include <string_view>

#include "base/check.h"
#include "base/containers/adapters.h"
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "net/base/ip_address.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/scheme_host_port_matcher_result.h"
#include "net/base/scheme_host_port_matcher_rule.h"
#include "net/base/url_util.h"
#include "net/device_bound_sessions/proto/storage.pb.h"
#include "net/device_bound_sessions/session.h"

namespace net::device_bound_sessions {

namespace {

bool IsIncludeSiteAllowed(const url::Origin& origin) {
  // This is eTLD+1
  const std::string domain_and_registry =
      registry_controlled_domains::GetDomainAndRegistry(
          origin, registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
  return !domain_and_registry.empty() && origin.host() == domain_and_registry;
}

SessionInclusionRules::InclusionResult AsInclusionResult(bool should_include) {
  return should_include ? SessionInclusionRules::kInclude
                        : SessionInclusionRules::kExclude;
}

// Types of characters valid in IPv6 addresses.
// Derived from logic in url::DoIPv6AddressToNumber() and url::DoParseIPv6().
bool IsValidIPv6Char(char c) {
  return c == ':' || base::IsHexDigit(c) || c == '.' ||
         // 'x' or 'X' is used in IPv4 to denote hex values, and can be used in
         // parts of IPv6 addresses.
         c == 'x' || c == 'X';
}

proto::RuleType GetRuleTypeProto(
    SessionInclusionRules::InclusionResult result) {
  return result == SessionInclusionRules::InclusionResult::kInclude
             ? proto::RuleType::INCLUDE
             : proto::RuleType::EXCLUDE;
}

std::optional<SessionInclusionRules::InclusionResult> GetInclusionResult(
    proto::RuleType proto) {
  if (proto == proto::RuleType::INCLUDE) {
    return SessionInclusionRules::InclusionResult::kInclude;
  } else if (proto == proto::RuleType::EXCLUDE) {
    return SessionInclusionRules::InclusionResult::kExclude;
  }

  // proto = RULE_TYPE_UNSPECIFIED
  return std::nullopt;
}

}  // namespace

// Encapsulates a single rule which applies to the request URL.
struct SessionInclusionRules::UrlRule {
  // URLs that match the rule will be subject to inclusion or exclusion as
  // specified by the type.
  InclusionResult rule_type;

  // Domain or pattern that the URL must match. This must either be a
  // full domain (host piece) or a pattern containing a wildcard in the
  // most-specific (leftmost) label position followed by a dot and a non-eTLD.
  // The matched strings follow SchemeHostPortMatcherRule's logic, but with
  // some extra requirements for validity:
  // - A leading wildcard * must be followed by a dot, so "*ple.com" is not
  //   acceptable.
  // - "*.com" is not accepted because com is an eTLD. Same with "*.co.uk" and
  //   similar.
  // - Multiple wildcards are not allowed.
  // - Internal wildcards are not allowed, so "sub.*.example.com" does not
  //   work because the wildcard is not the leftmost component.
  // - IP addresses also work if specified as the exact host, as described in
  //   SchemeHostPortMatcherRule.
  std::unique_ptr<SchemeHostPortMatcherRule> host_matcher_rule;

  // Prefix consisting of path components that the URL must match. Must begin
  // with '/'. Wildcards are not allowed. Simply use "/" to match all paths.
  std::string path_prefix;

  friend bool operator==(const UrlRule& lhs, const UrlRule& rhs) {
    return lhs.rule_type == rhs.rule_type &&
           lhs.path_prefix == rhs.path_prefix &&
           lhs.host_matcher_rule->ToString() ==
               rhs.host_matcher_rule->ToString();
  }

  // Returns whether the given `url` matches this rule. Note that this
  // function does not check the scheme and port portions of the URL/origin.
  bool MatchesHostAndPath(const GURL& url) const;
};

SessionInclusionRules::SessionInclusionRules(const url::Origin& origin)
    : origin_(origin), may_include_site_(IsIncludeSiteAllowed(origin)) {}

SessionInclusionRules::SessionInclusionRules() = default;

SessionInclusionRules::~SessionInclusionRules() = default;

SessionInclusionRules::SessionInclusionRules(SessionInclusionRules&& other) =
    default;

SessionInclusionRules& SessionInclusionRules::operator=(
    SessionInclusionRules&& other) = default;

bool SessionInclusionRules::operator==(
    const SessionInclusionRules& other) const = default;

void SessionInclusionRules::SetIncludeSite(bool include_site) {
  if (!may_include_site_) {
    return;
  }

  if (!include_site) {
    include_site_.reset();
    return;
  }

  include_site_ = SchemefulSite(origin_);
}

bool SessionInclusionRules::AddUrlRuleIfValid(InclusionResult rule_type,
                                              const std::string& host_pattern,
                                              const std::string& path_prefix) {
  if (path_prefix.empty() || path_prefix.front() != '/') {
    return false;
  }
  if (host_pattern.empty()) {
    return false;
  }

  // If only the origin is allowed, the host_pattern must be precisely its host.
  bool host_pattern_is_host = host_pattern == origin_.host();
  if (!may_include_site_ && !host_pattern_is_host) {
    return false;
  }

  // Don't allow '*' anywhere besides the first character of the pattern.
  size_t star_pos = host_pattern.rfind('*');
  if (star_pos != std::string::npos && star_pos != 0) {
    return false;
  }
  // Only allow wildcard if immediately followed by a dot.
  bool has_initial_wildcard_label = host_pattern.starts_with("*.");
  if (star_pos != std::string::npos && !has_initial_wildcard_label) {
    return false;
  }

  std::string_view hostlike_part{host_pattern};
  if (has_initial_wildcard_label) {
    hostlike_part = hostlike_part.substr(2);
  }

  bool presumed_ipv6 = host_pattern.front() == '[';
  if (presumed_ipv6 && host_pattern.back() != ']') {
    return false;
  }

  // Allow only specific characters into SchemeHostPortMatcherRule parsing.
  if (presumed_ipv6) {
    // Leave out the brackets, but everything else must be a valid char.
    std::string_view ipv6_address{host_pattern.begin() + 1,
                                  host_pattern.end() - 1};
    if (std::find_if_not(ipv6_address.begin(), ipv6_address.end(),
                         &IsValidIPv6Char) != ipv6_address.end()) {
      return false;
    }
  } else {
    // Note that this excludes a ':' character specifying a port number, even
    // though SchemeHostPortMatcherRule supports it. Same for '/' (for the
    // scheme or an IP block).
    // TODO(chlily): Consider supporting port numbers.
    if (!IsCanonicalizedHostCompliant(hostlike_part)) {
      return false;
    }
  }

  // Delegate the rest of the parsing to SchemeHostPortMatcherRule.
  std::unique_ptr<SchemeHostPortMatcherRule> host_matcher_rule =
      SchemeHostPortMatcherRule::FromUntrimmedRawString(host_pattern);
  if (!host_matcher_rule) {
    return false;
  }

  // Now that we know the host_pattern is at least the right shape, validate the
  // remaining restrictions.

  // Skip the eTLD lookups if the host pattern is an exact match.
  if (host_pattern_is_host) {
    url_rules_.emplace_back(rule_type, std::move(host_matcher_rule),
                            path_prefix);
    return true;
  }

  std::string hostlike_part_domain =
      registry_controlled_domains::GetDomainAndRegistry(
          hostlike_part,
          registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
  // If there is a wildcard, we require the pattern to be a normal domain and
  // not an eTLD.
  if (has_initial_wildcard_label && hostlike_part_domain.empty()) {
    return false;
  }

  // Validate that the host pattern is on the right origin/site.
  // TODO(chlily): Perhaps we should use a cached value, but surely URL rule
  // parsing only happens a small number of times.
  std::string domain_and_registry =
      registry_controlled_domains::GetDomainAndRegistry(
          origin_, registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
  // The origin_ must have an eTLD+1, because if it didn't, then we'd know that
  // !may_include_site_, and that would mean we'd have already returned early
  // and would never get here.
  CHECK(!domain_and_registry.empty());
  if (hostlike_part_domain != domain_and_registry) {
    return false;
  }

  url_rules_.emplace_back(rule_type, std::move(host_matcher_rule), path_prefix);
  return true;
}

SessionInclusionRules::InclusionResult
SessionInclusionRules::EvaluateRequestUrl(const GURL& url) const {
  bool same_origin = origin_.IsSameOriginWith(url);
  if (!may_include_site_ && !same_origin) {
    return SessionInclusionRules::kExclude;
  }

  // Evaluate against specific rules, most-recently-added first.
  for (const UrlRule& rule : base::Reversed(url_rules_)) {
    // The rule covers host and path, and scheme is checked too. We don't check
    // port here, because in the !may_include_site_ case that's already covered
    // by being same-origin, and in the may_include_site_ case it's ok for the
    // port to differ.
    if (rule.MatchesHostAndPath(url) &&
        url.scheme_piece() == origin_.scheme()) {
      return rule.rule_type;
    }
  }

  // None of the specific rules apply. Evaluate against the basic include rule.
  if (include_site_) {
    return AsInclusionResult(SchemefulSite(url) == *include_site_);
  }
  return AsInclusionResult(same_origin);
}

bool SessionInclusionRules::UrlRule::MatchesHostAndPath(const GURL& url) const {
  if (host_matcher_rule->Evaluate(url) ==
      SchemeHostPortMatcherResult::kNoMatch) {
    return false;
  }

  std::string_view url_path = url.path_piece();
  if (!url_path.starts_with(path_prefix)) {
    return false;
  }
  // We must check the following to prevent a path prefix like "/foo" from
  // erroneously matching a URL path like "/foobar/baz". There are 2 possible
  // cases: `url_path` may be the same length as `path_prefix`, or `url_path`
  // may be longer than `path_prefix`. In the first case, the two paths are
  // equal and a match has been found. In the second case, we want to know
  // whether the end of the `path_prefix` represents a full label in the path.
  // Either the path_prefix string ends in '/' and is explicitly the end of a
  // label, or the next character of `url_path` beyond the identical portion is
  // '/'. Otherwise, reject the path as a false (incomplete label) prefix match.
  CHECK(url_path.length() >= path_prefix.length());
  if (url_path.length() > path_prefix.length() && path_prefix.back() != '/' &&
      url_path[path_prefix.length()] != '/') {
    return false;
  }

  return true;
}

size_t SessionInclusionRules::num_url_rules_for_testing() const {
  return url_rules_.size();
}

proto::SessionInclusionRules SessionInclusionRules::ToProto() const {
  proto::SessionInclusionRules proto;
  proto.set_origin(origin_.Serialize());
  proto.set_do_include_site(include_site_.has_value());

  // Note that the ordering of the rules (in terms of when they were added to
  // the session) is preserved in the proto. Preserving the ordering is
  // important to handle rules overlap - the latest rule wins.
  for (auto& rule : url_rules_) {
    proto::UrlRule rule_proto;
    rule_proto.set_rule_type(GetRuleTypeProto(rule.rule_type));
    rule_proto.set_host_matcher_rule(rule.host_matcher_rule->ToString());
    rule_proto.set_path_prefix(rule.path_prefix);
    proto.mutable_url_rules()->Add(std::move(rule_proto));
  }

  return proto;
}

// static:
std::unique_ptr<SessionInclusionRules> SessionInclusionRules::CreateFromProto(
    const proto::SessionInclusionRules& proto) {
  if (!proto.has_origin() || !proto.has_do_include_site()) {
    return nullptr;
  }
  url::Origin origin = url::Origin::Create(GURL(proto.origin()));
  if (origin.opaque()) {
    DLOG(ERROR) << "proto origin parse error: " << origin.GetDebugString();
    return nullptr;
  }

  auto result = std::make_unique<SessionInclusionRules>(origin);
  result->SetIncludeSite(proto.do_include_site());
  for (const auto& rule_proto : proto.url_rules()) {
    std::optional<InclusionResult> rule_type =
        GetInclusionResult(rule_proto.rule_type());
    if (!rule_type.has_value() ||
        !result->AddUrlRuleIfValid(*rule_type, rule_proto.host_matcher_rule(),
                                   rule_proto.path_prefix())) {
      DLOG(ERROR) << "proto rule parse error: " << "type:"
                  << proto::RuleType_Name(rule_proto.rule_type()) << " "
                  << "matcher:" << rule_proto.host_matcher_rule() << " "
                  << "prefix:" << rule_proto.path_prefix();
      return nullptr;
    }
  }

  return result;
}

}  // namespace net::device_bound_sessions
chromium/net/device_bound_sessions/session_inclusion_rules.cc