chromium/components/url_matcher/url_matcher.cc

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_matcher/url_matcher.h"

#include <algorithm>
#include <iterator>
#include <utility>

#include "base/check.h"
#include "base/containers/contains.h"
#include "base/memory/ptr_util.h"
#include "base/notreached.h"
#include "url/gurl.h"
#include "url/url_canon.h"

MatcherStringPattern;
SubstringSetMatcher;

namespace url_matcher {

// This set of classes implements a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, ..., etc., to MatcherStringPatterns
// for use in substring comparisons.
//
// The idea of this mapping is to reduce the problem of comparing many
// URL Component Patterns against one URL to the problem of searching many
// substrings in one string:
//
// ----------------------                    ------------------------
// | URL Query operator | ----translate----> | MatcherStringPattern |
// ----------------------                    ------------------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
// ----------------------                    -----------------
// | URL to compare     |                    |               |
// | to all URL Query   | ----translate----> | String        |
// | operators          |                    |               |
// ----------------------                    -----------------
//
// The reason for this problem reduction is that there are efficient algorithms
// for searching many substrings in one string (see Aho-Corasick algorithm).
//
// Additionally, some of the same pieces are reused to implement regular
// expression comparisons. The FilteredRE2 implementation for matching many
// regular expressions against one string uses prefiltering, in which a set
// of substrings (derived from the regexes) are first searched for, to reduce
// the number of regular expressions to test; the prefiltering step also
// uses Aho-Corasick.
//
// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
// ==========================================================
//
// For searches in this class, we normalize URLs as follows:
//
// Step 1:
// Remove scheme, port and fragment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the fragment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - ED = End of Domain
// - EP = End of Path
// - EU = End of URL
// Furthermore, the hostname is canonicalized to start with a ".".
//
// Position markers are represented as characters >127, which are therefore
// guaranteed not to be part of the ASCII-7 encoded URL character set.
//
// -> www.example.com/index.html?search=foo becomes
// BU .www.example.com ED /index.html EP ?search=foo EU
//
// -> www.example.com/index.html becomes
// BU .www.example.com ED /index.html EP EU
//
// Step 3:
// Translate URL Component Patterns as follows:
//
// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
// -> host_prefix("www.example") = BU .www.example
//
// host_suffix(suffix) = suffix ED
// -> host_suffix("example.com") = example.com ED
// -> host_suffix(".example.com") = .example.com ED
//
// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
// -> host_equals("www.example.com") = BU .www.example.com ED
//
// Similarly for the path and query components
// ({path, query}_{prefix, suffix, equals}).
//
// With this, we can search the MatcherStringPatterns in the normalized URL.
//
//
// Case 2: url_{prefix,suffix,equals,contains} searches.
// =====================================================
//
// Step 1: as above, except that
// - the scheme is not removed
// - the port is not removed if it is specified and does not match the default
//   port for the given scheme.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - EU = End of URL
//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
// BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
// BU http://www.example.com/index.html?search=foo EU
//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
//
// url_contains(substring) = substring
// -> url_contains("index") = index
//
//
// Case 3: {host,path,query}_contains searches.
// ============================================
//
// These kinds of searches are not supported directly but can be derived
// by a combination of a url_contains() query followed by an explicit test:
//
// host_contains(str) = url_contains(str) followed by test whether str occurs
//   in host component of original URL.
// -> host_contains("example.co") = example.co
//    followed by gurl.host().find("example.co");
//
// [similarly for path_contains and query_contains].
//
//
// Regular expression matching (url_matches searches)
// ==================================================
//
// This class also supports matching regular expressions (RE2 syntax)
// against full URLs, which are transformed as in case 2.

// File-local helpers.
// NOTE(review): all three helpers below return bool but have empty bodies, so
// no value is returned; the implementations appear to be missing here.
namespace {

// Presumably reports whether |criterion| denotes a regular-expression match
// (see "Regular expression matching" in the notes above) — TODO confirm.
bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {}

// Presumably reports whether |criterion| denotes the origin-and-path flavour
// of regular-expression matching — TODO confirm.
bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {}

// Presumably reports whether |matcher| holds no patterns (or is null) — TODO
// confirm.
bool IsMatcherEmpty(const std::unique_ptr<SubstringSetMatcher>& matcher) {}

}  // namespace

//
// URLMatcherCondition
//

// Default constructor.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherCondition::URLMatcherCondition()
    :{}

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLMatcherCondition::~URLMatcherCondition() {}

// Constructs a condition from a |criterion| and the |string_pattern| it is
// associated with. |string_pattern| is passed as a raw pointer; ownership is
// not visible from here — presumably not owned by the condition, TODO confirm.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherCondition::URLMatcherCondition(
    Criterion criterion,
    const MatcherStringPattern* string_pattern)
    :{}

// Copy constructor.
// NOTE(review): nothing follows the ':' below; the member initializers that
// would copy from |rhs| appear to be missing here.
URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    :{}

// Copy assignment from |rhs|.
// NOTE(review): the body is empty, so nothing is copied and no reference is
// returned; the implementation appears to be missing here.
URLMatcherCondition& URLMatcherCondition::operator=(
    const URLMatcherCondition& rhs) {}

// Ordering comparison against |rhs|; presumably provided so conditions can be
// kept in ordered containers — TODO confirm the ordering key.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {}

// Presumably reports whether this condition is evaluated against the full URL
// (the url_* searches of "Case 2" in the notes above) rather than against
// individual URL components — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCondition::IsFullURLCondition() const {}

// Presumably reports whether this condition is a regular-expression condition
// — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCondition::IsRegexCondition() const {}

// Presumably reports whether this condition is an origin-and-path
// regular-expression condition — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {}

// Presumably decides whether this condition is satisfied for |url|, given the
// IDs of the string patterns that were found (|matching_patterns|). The notes
// above ("Case 3") describe *_contains searches as a substring hit followed by
// an explicit test against the URL component, which would explain why |url| is
// passed in addition to the pattern IDs — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCondition::IsMatch(
    const std::set<MatcherStringPattern::ID>& matching_patterns,
    const GURL& url) const {}

//
// URLMatcherConditionFactory
//

namespace {
// Position-marker symbols (BU, ED, EP, EU in the notes above, plus the query
// component delimiter). These are symbols that are not contained in the 7-bit
// ASCII used in GURLs.
// NOTE(review): nothing follows the '=' in any of the definitions below; the
// initializer values appear to be missing here.
const char kBeginningOfURL[] =;
const char kEndOfDomain[] =;
const char kEndOfPath[] =;
const char kQueryComponentDelimiter[] =;
const char kEndOfURL[] =;

// The delimiter for query parameters
const char kQuerySeparator =;
}  // namespace

// Compiler-generated default constructor, defined out of line.
URLMatcherConditionFactory::URLMatcherConditionFactory() = default;

// Compiler-generated destructor, defined out of line.
URLMatcherConditionFactory::~URLMatcherConditionFactory() = default;

// Intended to turn |url| into the string form used for
// {host,path,query}_{prefix,suffix,equals} searches ("Case 1" in the notes at
// the top of this file), i.e. something of the shape
//   BU .www.example.com ED /index.html EP ?search=foo EU
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    const GURL& url) const {}

// Intended to create a host_prefix condition. Per the notes at the top of
// this file: host_prefix(prefix) = BU add_missing_dot_prefix(prefix).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    const std::string& prefix) {}

// Intended to create a host_suffix condition. Per the notes at the top of
// this file: host_suffix(suffix) = suffix ED.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    const std::string& suffix) {}

// Intended to create a host_contains condition. Per "Case 3" in the notes at
// the top of this file, this is a url_contains(str) search followed by an
// explicit test that |str| occurs in the host component of the original URL.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    const std::string& str) {}

// Intended to create a host_equals condition. Per the notes at the top of
// this file: host_equals(domain) = BU add_missing_dot_prefix(domain) ED.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    const std::string& str) {}

// Intended to create a path_prefix condition, the path analogue of
// host_prefix (the notes at the top of this file say the path patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    const std::string& prefix) {}

// Intended to create a path_suffix condition, the path analogue of
// host_suffix (the notes at the top of this file say the path patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    const std::string& suffix) {}

// Intended to create a path_contains condition. Per "Case 3" in the notes at
// the top of this file, *_contains searches are a url_contains(str) search
// followed by an explicit test against the relevant component (here: path).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    const std::string& str) {}

// Intended to create a path_equals condition, the path analogue of
// host_equals (the notes at the top of this file say the path patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    const std::string& str) {}

// Intended to create a query_prefix condition, the query analogue of
// host_prefix (the notes at the top of this file say the query patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    const std::string& prefix) {}

// Intended to create a query_suffix condition, the query analogue of
// host_suffix (the notes at the top of this file say the query patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    const std::string& suffix) {}

// Intended to create a query_contains condition. Per "Case 3" in the notes at
// the top of this file, *_contains searches are a url_contains(str) search
// followed by an explicit test against the relevant component (here: query).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    const std::string& str) {}

// Intended to create a query_equals condition, the query analogue of
// host_equals (the notes at the top of this file say the query patterns are
// built "similarly"; the exact marker layout is not spelled out there).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    const std::string& str) {}

// Intended to create a single condition combining a host suffix with a path
// prefix. In the canonical form described at the top of this file the host is
// immediately followed by ED and then the path, so presumably the pattern is
// |host_suffix| ED |path_prefix| — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition
URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    const std::string& host_suffix,
    const std::string& path_prefix) {}

// Intended to create a single condition combining an exact host match with a
// path prefix. In the canonical form described at the top of this file the
// host sits between BU and ED and is followed by the path, so presumably the
// pattern is BU |host| ED |path_prefix| — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    const std::string& host,
    const std::string& path_prefix) {}

// Intended to turn |url| into the string form used for
// url_{prefix,suffix,equals,contains} searches ("Case 2" in the notes at the
// top of this file): the scheme is kept, the port is kept only when it is
// specified and differs from the scheme's default, and the result is wrapped
// in the BU / EU markers.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    const GURL& url) const {}

// File-local helper, presumably shared by the two
// CanonicalizeURLFor*RegexSearches() methods; |clear_query| presumably selects
// whether the query part of |url| is dropped — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
static std::string CanonicalizeURLForRegexSearchesHelper(const GURL& url,
                                                         bool clear_query) {}

// Intended to turn |url| into the string that regular expressions are matched
// against. The notes at the top of this file say regexes are matched against
// full URLs "transformed as in case 2".
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    const GURL& url) const {}

// Intended to turn |url| into the string that origin-and-path regular
// expressions are matched against; presumably the same as the regular regex
// form but without the query — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string
URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    const GURL& url) const {}

// Intended to create a url_prefix condition. Per the notes at the top of this
// file: url_prefix(prefix) = BU prefix.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    const std::string& prefix) {}

// Intended to create a url_suffix condition; presumably the pattern is
// |suffix| followed by the EU marker, mirroring url_prefix — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    const std::string& suffix) {}

// Intended to create a url_contains condition. Per the notes at the top of
// this file: url_contains(substring) = substring (no position markers).
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    const std::string& str) {}

// Intended to create a url_equals condition; presumably the pattern is |str|
// enclosed in the BU and EU markers — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    const std::string& str) {}

// Intended to create a url_matches condition for |regex| (RE2 syntax, per the
// notes at the top of this file), matched against the full URL.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    const std::string& regex) {}

// Intended to create a regular-expression condition for |regex| that is
// matched against the origin-and-path form of the URL — TODO confirm which
// URL parts that form contains.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition
URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    const std::string& regex) {}

// Presumably drops every pattern held by the factory whose ID is not listed in
// |used_patterns| — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcherConditionFactory::ForgetUnusedPatterns(
    const std::set<MatcherStringPattern::ID>& used_patterns) {}

// Presumably reports whether the factory currently holds no patterns — TODO
// confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherConditionFactory::IsEmpty() const {}

// Presumably the common back end of the Create*Condition() methods: builds a
// condition of kind |criterion| for the already-translated |pattern| string —
// TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    URLMatcherCondition::Criterion criterion,
    const std::string& pattern) {}

// Intended to bring a host |suffix| into canonical form. The examples at the
// top of this file keep the suffix as given ("example.com" and ".example.com"
// are both used unchanged before ED); any further normalization is not
// visible here.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
    const std::string& suffix) const {}

// Intended to bring a host |prefix| into canonical form; presumably this is
// the add_missing_dot_prefix() step from the notes at the top of this file
// ("www.example" -> ".www.example") — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
    const std::string& prefix) const {}

// Intended to bring a full |hostname| into canonical form. The notes at the
// top of this file state that the hostname is canonicalized to start with a
// "."; any further normalization is not visible here.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeHostname(
    const std::string& hostname) const {}

// This function prepares the query string by replacing query separator with a
// magic value (|kQueryComponentDelimiter|). When the boolean
// |prepend_beginning_of_query_component| is true the function prepends the
// query with the same magic. This is done to locate the start of a key value
// pair in the query string. The parameter |query| is passed by value
// intentionally, since it is locally modified.
// |append_end_of_query_component| presumably does the same at the end of the
// query, to mark where the last key value pair stops — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::string URLMatcherConditionFactory::CanonicalizeQuery(
    std::string query,
    bool prepend_beginning_of_query_component,
    bool append_end_of_query_component) const {}

// Presumably hands out the next unused pattern ID — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
base::MatcherStringPattern::ID URLMatcherConditionFactory::GetNextID() {}

// Comparison functor over MatcherStringPattern pointers; presumably orders by
// the pointed-to patterns rather than by address so that equal patterns can be
// found in an ordered container — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherConditionFactory::MatcherStringPatternPointerCompare::operator()(
    MatcherStringPattern* lhs,
    MatcherStringPattern* rhs) const {}

//
// URLQueryElementMatcherCondition
//

// Constructs a condition on one query element, described by |key| and |value|
// plus three mode selectors (|query_value_match_type|, |query_element_type|,
// |match_type|). |factory| is passed as a raw pointer; presumably it is used
// to create the underlying string pattern and is not owned — TODO confirm.
// The body is empty here, so none of the arguments are used; the
// implementation appears to be missing.
URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
    const std::string& key,
    const std::string& value,
    QueryValueMatchType query_value_match_type,
    QueryElementType query_element_type,
    Type match_type,
    URLMatcherConditionFactory* factory) {}

// Compiler-generated (member-wise) copy constructor, defined out of line.
URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
    const URLQueryElementMatcherCondition& other) = default;

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}

// Ordering comparison against |rhs|; presumably provided so these conditions
// can be kept in ordered containers — TODO confirm the ordering key.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLQueryElementMatcherCondition::operator<(
    const URLQueryElementMatcherCondition& rhs) const {}

// Presumably tests this query-element condition against
// |url_for_component_searches|, i.e. a URL already brought into the
// component-search string form — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLQueryElementMatcherCondition::IsMatch(
    const std::string& url_for_component_searches) const {}

//
// URLMatcherSchemeFilter
//

// Constructs a scheme filter from a single |filter| string.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    :{}

// Constructs a scheme filter from a list of |filters| strings.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    const std::vector<std::string>& filters)
    :{}

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}

// Presumably reports whether the scheme of |url| is accepted by this filter.
// This would be the "secondary filter step" for schemes mentioned in the notes
// at the top of this file — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {}

//
// URLMatcherPortFilter
//

// Constructs a port filter from a list of port |ranges|.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherPortFilter::URLMatcherPortFilter(
    const std::vector<URLMatcherPortFilter::Range>& ranges)
    :{}

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLMatcherPortFilter::~URLMatcherPortFilter() {}

// Presumably reports whether the port of |url| falls into one of this filter's
// ranges. This would be the "secondary filter step" for ports mentioned in the
// notes at the top of this file — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherPortFilter::IsMatch(const GURL& url) const {}

// Builds a Range from the ports |from| and |to|; whether the bounds are
// inclusive is not visible here — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
                                                              int to) {}

// Builds a Range for the single |port|; presumably equivalent to a range that
// starts and ends at |port| — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {}

//
// URLMatcherCidrBlockFilter
//

// Constructs a filter from a list of |cidr_blocks|.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherCidrBlockFilter::URLMatcherCidrBlockFilter(
    const std::vector<URLMatcherCidrBlockFilter::CidrBlock>& cidr_blocks)
    :{}

// Compiler-generated destructor, defined out of line.
URLMatcherCidrBlockFilter::~URLMatcherCidrBlockFilter() = default;

// Presumably reports whether the host of |url| is an IP address that lies in
// one of this filter's CIDR blocks — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherCidrBlockFilter::IsMatch(const GURL& url) const {}

// Builds a CidrBlock from the textual |entry|. The return type carries either
// the CidrBlock or a std::string, presumably an error description for an
// |entry| that cannot be parsed — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
// static
base::expected<URLMatcherCidrBlockFilter::CidrBlock, std::string>
URLMatcherCidrBlockFilter::CreateCidrBlock(const std::string& entry) {}

//
// URLMatcherConditionSet
//

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLMatcherConditionSet::~URLMatcherConditionSet() {}

// Constructs a condition set identified by |id| from |conditions| only, with
// no scheme, port or CIDR-block filters.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherConditionSet::URLMatcherConditionSet(
    base::MatcherStringPattern::ID id,
    const Conditions& conditions)
    :{}

// Constructs a condition set identified by |id| from |conditions| plus three
// filters. The filters are taken as std::unique_ptr by value, i.e. the caller
// hands over ownership; whether a null filter means "no restriction" is not
// visible here — TODO confirm.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherConditionSet::URLMatcherConditionSet(
    base::MatcherStringPattern::ID id,
    const Conditions& conditions,
    std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
    std::unique_ptr<URLMatcherPortFilter> port_filter,
    std::unique_ptr<URLMatcherCidrBlockFilter> cidr_block_filter)
    :{}

// Constructs a condition set identified by |id| from |conditions|, additional
// |query_conditions|, and scheme/port filters. Unlike the overload without
// query conditions, this one takes no CIDR-block filter. The filters are taken
// as std::unique_ptr by value, i.e. the caller hands over ownership.
// NOTE(review): nothing follows the ':' below; the member initializers appear
// to be missing here.
URLMatcherConditionSet::URLMatcherConditionSet(
    base::MatcherStringPattern::ID id,
    const Conditions& conditions,
    const QueryConditions& query_conditions,
    std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
    std::unique_ptr<URLMatcherPortFilter> port_filter)
    :{}

// Presumably reports whether this condition set matches |url|, given the IDs
// of the string patterns that were found (|matching_patterns|). This is the
// overload without a pre-canonicalized URL string — TODO confirm how it
// relates to the three-argument overload.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherConditionSet::IsMatch(
    const std::set<MatcherStringPattern::ID>& matching_patterns,
    const GURL& url) const {}

// Presumably reports whether this condition set matches |url|, given the IDs
// of the string patterns that were found (|matching_patterns|).
// |url_for_component_searches| is presumably |url| in the component-search
// string form, needed for query-element conditions — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcherConditionSet::IsMatch(
    const std::set<MatcherStringPattern::ID>& matching_patterns,
    const GURL& url,
    const std::string& url_for_component_searches) const {}

//
// URLMatcher
//

// Default constructor. The body is empty; members are default-initialized.
URLMatcher::URLMatcher() {}

// Destructor. The body is empty, so no explicit cleanup is performed here.
URLMatcher::~URLMatcher() {}

// Presumably registers |condition_sets| with the matcher and refreshes the
// internal lookup structures — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::AddConditionSets(
    const URLMatcherConditionSet::Vector& condition_sets) {}

// Presumably unregisters the condition sets whose IDs are listed in
// |condition_set_ids| and refreshes the internal lookup structures — TODO
// confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::RemoveConditionSets(
    const std::vector<base::MatcherStringPattern::ID>& condition_set_ids) {}

// Presumably releases state belonging to condition sets that are no longer
// registered — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::ClearUnusedConditionSets() {}

// Intended to return the IDs of all registered condition sets that match
// |url|. The approach described at the top of this file is to translate the
// URL into a string and search all patterns in it at once.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
std::set<base::MatcherStringPattern::ID> URLMatcher::MatchURL(
    const GURL& url) const {}

// Presumably reports whether the matcher holds no condition sets and no
// derived lookup state — TODO confirm.
// NOTE(review): the body is empty, so no value is returned; the implementation
// appears to be missing here.
bool URLMatcher::IsEmpty() const {}

// Presumably rebuilds one of two substring matchers from the registered
// conditions; |full_url_conditions| presumably selects between the full-URL
// patterns and the URL-component patterns — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {}

// Presumably rebuilds the regular-expression matcher(s) from the registered
// regex conditions — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::UpdateRegexSetMatcher() {}

// Presumably recomputes which pattern hits should trigger evaluation of which
// condition sets — TODO confirm; nothing in this file shows what a "trigger"
// is.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::UpdateTriggers() {}

// Presumably tells the condition factory which patterns are still in use so
// it can drop the rest (cf. URLMatcherConditionFactory::ForgetUnusedPatterns)
// — TODO confirm.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::UpdateConditionFactory() {}

// Presumably the single entry point that runs the individual Update*() steps
// after the set of registered condition sets has changed — TODO confirm the
// steps and their order.
// The body is empty here, so the call currently has no effect; the
// implementation appears to be missing.
void URLMatcher::UpdateInternalDatastructures() {}

}  // namespace url_matcher