// Copyright 2015 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // url_formatter contains routines for formatting URLs in a way that can be // safely and securely displayed to users. For example, it is responsible // for determining when to convert an IDN A-Label (e.g. "xn--[something]") // into the IDN U-Label. // // Note that this formatting is only intended for display purposes; it would // be insecure and insufficient to make comparisons solely on formatted URLs // (that is, it should not be used for normalizing URLs for comparison for // security decisions). #ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ #define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ #include <stddef.h> #include <stdint.h> #include <string> #include <string_view> #include <vector> #include "base/containers/flat_set.h" #include "base/strings/escape.h" #include "base/strings/utf_offset_string_conversions.h" #include "components/url_formatter/spoof_checks/idn_spoof_checker.h" class GURL; namespace url { struct Component; struct Parsed; } namespace url_formatter { Skeletons; // Used by FormatUrl to specify handling of certain parts of the url. FormatUrlType; FormatUrlTypes; // The result of an IDN to Unicode conversion. struct IDNConversionResult { … }; // Nothing is omitted. extern const FormatUrlType kFormatUrlOmitNothing; // If set, any username and password are removed. extern const FormatUrlType kFormatUrlOmitUsernamePassword; // If the scheme is 'http://', it's removed. extern const FormatUrlType kFormatUrlOmitHTTP; // Omits the path if it is just a slash and there is no query or ref. This is // meaningful for non-file "standard" URLs. extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname; // If the scheme is 'https://', it's removed. Not in kFormatUrlOmitDefaults. extern const FormatUrlType kFormatUrlOmitHTTPS; // Omits some trivially informative subdomains such as "www". Not in // kFormatUrlOmitDefaults. extern const FormatUrlType kFormatUrlOmitTrivialSubdomains; // Omits everything after the host: the path, query, ref, username and password // are all omitted. extern const FormatUrlType kFormatUrlTrimAfterHost; // If the scheme is 'file://', it's removed. Not in kFormatUrlOmitDefaults. extern const FormatUrlType kFormatUrlOmitFileScheme; // If the scheme is 'mailto:', it's removed. Not in kFormatUrlOmitDefaults. extern const FormatUrlType kFormatUrlOmitMailToScheme; // Omits the mobile prefix "m". Not in kFormatUrlOmitDefaults. extern const FormatUrlType kFormatUrlOmitMobilePrefix; // Convenience for omitting all unnecessary types. Does not include HTTPS scheme // removal, or experimental flags. extern const FormatUrlType kFormatUrlOmitDefaults; // Creates a string representation of |url|. The IDN host name is turned to // Unicode if the Unicode representation is deemed safe. |format_type| is a // bitmask of FormatUrlTypes, see it for details. |unescape_rules| defines how // to clean the URL for human readability. You will generally want // |UnescapeRule::SPACES| for display to the user if you can handle spaces, or // |UnescapeRule::NORMAL| if not. If the path part and the query part seem to // be encoded in %-encoded UTF-8, decodes %-encoding and UTF-8. // // The last three parameters may be NULL. // // |new_parsed| will be set to the parsing parameters of the resultant URL. // // |prefix_end| will be the length before the hostname of the resultant URL. // // |offset[s]_for_adjustment| specifies one or more offsets into the original // URL, representing insertion or selection points between characters: if the // input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is // between the scheme and the host, and offset 15 is after the end of the URL. // Valid input offsets range from 0 to the length of the input URL string. On // exit, each offset will have been modified to reflect any changes made to the // output string. For example, if |url| is "http://a:[email protected]/", // |omit_username_password| is true, and an offset is 12 (pointing between 'c' // and '.'), then on return the output string will be "http://c.com/" and the // offset will be 8. If an offset cannot be successfully adjusted (e.g. because // it points into the middle of a component that was entirely removed or into // the middle of an encoding sequence), it will be set to std::u16string::npos. // For consistency, if an input offset points between the scheme and the // username/password, and both are removed, on output this offset will be 0 // rather than npos; this means that offsets at the starts and ends of removed // components are always transformed the same way regardless of what other // components are adjacent. std::u16string FormatUrl(const GURL& url, FormatUrlTypes format_types, base::UnescapeRule::Type unescape_rules, url::Parsed* new_parsed, size_t* prefix_end, size_t* offset_for_adjustment); std::u16string FormatUrlWithOffsets( const GURL& url, FormatUrlTypes format_types, base::UnescapeRule::Type unescape_rules, url::Parsed* new_parsed, size_t* prefix_end, std::vector<size_t>* offsets_for_adjustment); // This function is like those above except it takes |adjustments| rather // than |offset[s]_for_adjustment|. |adjustments| will be set to reflect all // the transformations that happened to |url| to convert it into the returned // value. std::u16string FormatUrlWithAdjustments( const GURL& url, FormatUrlTypes format_types, base::UnescapeRule::Type unescape_rules, url::Parsed* new_parsed, size_t* prefix_end, base::OffsetAdjuster::Adjustments* adjustments); // This is a convenience function for FormatUrl() with // format_types = kFormatUrlOmitDefaults and unescape = SPACES. This is the // typical set of flags for "URLs to display to the user". You should be // cautious about using this for URLs which will be parsed or sent to other // applications. inline std::u16string FormatUrl(const GURL& url) { … } // Returns whether FormatUrl() would strip a trailing slash from |url|, given a // format flag including kFormatUrlOmitTrailingSlashOnBareHostname. bool CanStripTrailingSlash(const GURL& url); // Formats the host in |url| and appends it to |output|. void AppendFormattedHost(const GURL& url, std::u16string* output); // Converts the given host name to unicode characters. This can be called for // any host name, if the input is not IDN or is invalid in some way, we'll just // return the ASCII source so it is still usable. // // The input should be the canonicalized ASCII host name from GURL. This // function does NOT accept UTF-8! std::u16string IDNToUnicode(std::string_view host); // Same as IDNToUnicode, but disables spoof checks and returns more details. // In particular, it doesn't fall back to punycode if |host| fails spoof checks // in IDN spoof checker or is a lookalike of a top domain. // DO NOT use this for displaying URLs. IDNConversionResult UnsafeIDNToUnicodeWithDetails(std::string_view host); // Strips a "www." prefix from |host| if present and if |host| is eligible. // |host| is only eligible for www-stripping if it is not a private or intranet // hostname, and if "www." is part of the subdomain (not the eTLD+1). std::string StripWWW(const std::string& host); // Strips a "m." prefix from |host| if present. std::string StripMobilePrefix(const std::string& text); // If the |host| component of |url| begins with a "www." prefix (and meets the // conditions described for StripWWW), then updates |host| to strip the "www." // prefix. void StripWWWFromHostComponent(const std::string& url, url::Component* host); // Returns skeleton strings computed from |host| for spoof checking. Skeletons GetSkeletons(const std::u16string& host); // Returns a domain from the top 10K list matching the given skeleton. Used for // spoof checking. Different types of skeletons are saved in the skeleton trie. // Providing |type| makes sure the right type of skeletons are looked up. For // example if |skeleton|="googlecorn", |type|="kFull", no match would be found // even though the skeleton is saved in the trie, because the type of this // skeleton in the trie is "kSeparatorsRemoved". TopDomainEntry LookupSkeletonInTopDomains( const std::string& skeleton, const SkeletonType type = SkeletonType::kFull); // Removes diacritics from `host` and returns the new string if the input // only contains Latin-Greek-Cyrillic characters. Otherwise, returns the // input string. std::u16string MaybeRemoveDiacritics(const std::u16string& host); // Returns the first IDNA 2008 deviation character in the `hostname`, if any. // See idn_spoof_checker.h for details about deviation characters. IDNA2008DeviationCharacter GetDeviationCharacter(std::u16string_view hostname); } // namespace url_formatter #endif // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_