// Copyright 2020 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40284755): Remove this and spanify to fix the errors. #pragma allow_unsafe_buffers #endif #include "base/strings/escape.h" #include <ostream> #include <string_view> #include "base/check_op.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversion_utils.h" #include "base/strings/utf_string_conversions.h" #include "base/third_party/icu/icu_utf.h" namespace base { namespace { // A fast bit-vector map for ASCII characters. // // Internally stores 256 bits in an array of 8 ints. // Does quick bit-flicking to look up needed characters. struct Charmap { … }; // Given text to escape and a Charmap defining which values to escape, // return an escaped string. If use_plus is true, spaces are converted // to +, otherwise, if spaces are in the charmap, they are converted to // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if // '%' is in the charmap, it is converted to %25. std::string Escape(std::string_view text, const Charmap& charmap, bool use_plus, bool keep_escaped = false) { … } // Convert a character |c| to a form that will not be mistaken as HTML. template <class str> void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { … } // Convert |input| string to a form that will not be interpreted as HTML. template <typename T, typename CharT = typename T::value_type> std::basic_string<CharT> EscapeForHTMLImpl(T input) { … } // Everything except alphanumerics and -._~ // See RFC 3986 for the list of unreserved characters. static const Charmap kUnreservedCharmap = …; // Everything except alphanumerics and !'()*-._~ // See RFC 2396 for the list of reserved characters. 
static const Charmap kQueryCharmap = …; // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} static const Charmap kPathCharmap = …; #if BUILDFLAG(IS_APPLE) // non-printable, non-7bit, and (including space) "#%<>[\]^`{|} static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L, 0xb8000001L, 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL}}; #endif // BUILDFLAG(IS_APPLE) // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} static const Charmap kUrlEscape = …; // non-7bit, as well as %. static const Charmap kNonASCIICharmapAndPercent = …; // non-7bit static const Charmap kNonASCIICharmap = …; // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and // !'()*-._~#[] static const Charmap kExternalHandlerCharmap = …; // Contains nonzero when the corresponding character is unescapable for normal // URLs. These characters are the ones that may change the parsing of a URL, so // we don't want to unescape them sometimes. In many cases we won't want to // unescape spaces, but that is controlled by parameters to Unescape*. // // The basic rule is that we can't unescape anything that would change parsing // like # or ?. We also can't unescape &, =, or + since that could be part of a // query and that could change the server's parsing of the query. Nor can we // unescape \ since src/url/ will convert it to a /. // // Lastly, we can't unescape anything that doesn't have a canonical // representation in a URL. This means that unescaping will change the URL, and // you could get different behavior if you copy and paste the URL, or press // enter in the URL bar. The list of characters that fall into this category // are the ones labeled PASS (allow either escaped or unescaped) in the big // lookup table at the top of url/url_canon_path.cc. 
Also, characters // that have CHAR_QUERY set in url/url_canon_internal.cc but are not // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are // not unescaped, to avoid turning a valid url according to spec into an // invalid one. // clang-format off const char kUrlUnescape[128] = …; // clang-format on // Attempts to unescape the sequence at |index| within |escaped_text|. If // successful, sets |value| to the unescaped value. Returns whether // unescaping succeeded. bool UnescapeUnsignedByteAtIndex(std::string_view escaped_text, size_t index, unsigned char* value) { … } // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at // the specified index. On success, returns true, sets |code_point_out| to be // the character's code point and |unescaped_out| to be the unescaped UTF-8 // string. |unescaped_out| will always be 1/3rd the length of the substring of // |escaped_text| that corresponds to the unescaped character. bool UnescapeUTF8CharacterAtIndex(std::string_view escaped_text, size_t index, base_icu::UChar32* code_point_out, std::string* unescaped_out) { … } // This method takes a Unicode code point and returns true if it should be // unescaped, based on |rules|. bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, base_icu::UChar32 code_point) { … } // Unescapes |escaped_text| according to |rules|, returning the resulting // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects // the alterations done to the string that are not one-character-to-one- // character. The resulting |adjustments| will always be sorted by increasing // offset. 
std::string UnescapeURLWithAdjustmentsImpl( std::string_view escaped_text, UnescapeRule::Type rules, OffsetAdjuster::Adjustments* adjustments) { … } } // namespace std::string EscapeAllExceptUnreserved(std::string_view text) { … } std::string EscapeQueryParamValue(std::string_view text, bool use_plus) { … } std::string EscapePath(std::string_view path) { … } #if BUILDFLAG(IS_APPLE) std::string EscapeNSURLPrecursor(std::string_view precursor) { return Escape(precursor, kNSURLCharmap, false, true); } #endif // BUILDFLAG(IS_APPLE) std::string EscapeUrlEncodedData(std::string_view path, bool use_plus) { … } std::string EscapeNonASCIIAndPercent(std::string_view input) { … } std::string EscapeNonASCII(std::string_view input) { … } std::string EscapeExternalHandlerValue(std::string_view text) { … } void AppendEscapedCharForHTML(char c, std::string* output) { … } std::string EscapeForHTML(std::string_view input) { … } std::u16string EscapeForHTML(std::u16string_view input) { … } std::string UnescapeURLComponent(std::string_view escaped_text, UnescapeRule::Type rules) { … } std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments( std::string_view text, UnescapeRule::Type rules, OffsetAdjuster::Adjustments* adjustments) { … } std::string UnescapeBinaryURLComponent(std::string_view escaped_text, UnescapeRule::Type rules) { … } bool UnescapeBinaryURLComponentSafe(std::string_view escaped_text, bool fail_on_path_separators, std::string* unescaped_text) { … } bool ContainsEncodedBytes(std::string_view escaped_text, const std::set<unsigned char>& bytes) { … } std::u16string UnescapeForHTML(std::u16string_view input) { … } } // namespace base