// escape.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
#pragma allow_unsafe_buffers
#endif

#include "base/strings/escape.h"

#include <ostream>
#include <string_view>

#include "base/check_op.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/strings/utf_string_conversions.h"
#include "base/third_party/icu/icu_utf.h"

namespace base {

namespace {

// A fast bit-vector map for ascii characters.
//
// Internally stores 256 bits in an array of 8 ints.
// Uses quick bit operations to look up the needed characters.
struct Charmap { … };

// Given text to escape and a Charmap defining which values to escape,
// return an escaped string.  If use_plus is true, spaces are converted
// to +, otherwise, if spaces are in the charmap, they are converted to
// %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
// '%' is in the charmap, it is converted to %25.
std::string Escape(std::string_view text,
                   const Charmap& charmap,
                   bool use_plus,
                   bool keep_escaped = false) { … }

// Convert a character |c| to a form that will not be mistaken as HTML.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { … }

// Convert |input| string to a form that will not be interpreted as HTML.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) { … }

// Everything except alphanumerics and -._~
// See RFC 3986 for the list of unreserved characters.
static const Charmap kUnreservedCharmap = …;

// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = …;

// Non-printable and non-7-bit characters, plus space and "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = …;

#if BUILDFLAG(IS_APPLE)
// Set of characters to escape, used by EscapeNSURLPrecursor(): non-printable
// and non-7-bit characters, plus space and "#%<>[\]^`{|}
//
// Each 32-bit word below covers 32 consecutive character codes. Assuming
// Charmap tests bit (c & 31) of word (c >> 5) -- which agrees with the
// character list above, but confirm against Charmap -- the words decode as:
//   word 0    (0x00-0x1f): every control character
//   word 1    (0x20-0x3f): space " # % < >
//   word 2    (0x40-0x5f): [ \ ] ^
//   word 3    (0x60-0x7f): ` { | } and DEL (0x7f)
//   words 4-7 (0x80-0xff): every non-7-bit byte
static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};
#endif  // BUILDFLAG(IS_APPLE)

// Non-printable and non-7-bit characters, plus space and ?>=<;+'&%$#"![\]^`{|}
static const Charmap kUrlEscape = …;

// non-7bit, as well as %.
static const Charmap kNonASCIICharmapAndPercent = …;

// non-7bit
static const Charmap kNonASCIICharmap = …;

// Everything except alphanumerics, the reserved characters (;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = …;

// Contains nonzero when the corresponding character is unescapable for normal
// URLs. These characters are the ones that may change the parsing of a URL, so
// we don't want to unescape them sometimes. In many cases we won't want to
// unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc.  Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
// clang-format off
const char kUrlUnescape[128] = …;
// clang-format on

// Attempts to unescape the sequence at |index| within |escaped_text|.  If
// successful, sets |value| to the unescaped value.  Returns whether
// unescaping succeeded.
bool UnescapeUnsignedByteAtIndex(std::string_view escaped_text,
                                 size_t index,
                                 unsigned char* value) { … }

// Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
// the specified index. On success, returns true, sets |code_point_out| to be
// the character's code point and |unescaped_out| to be the unescaped UTF-8
// string. |unescaped_out| will always be one third the length of the substring
// of |escaped_text| that corresponds to the unescaped character.
bool UnescapeUTF8CharacterAtIndex(std::string_view escaped_text,
                                  size_t index,
                                  base_icu::UChar32* code_point_out,
                                  std::string* unescaped_out) { … }

// This method takes a Unicode code point and returns true if it should be
// unescaped, based on |rules|.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
                             base_icu::UChar32 code_point) { … }

// Unescapes |escaped_text| according to |rules|, returning the resulting
// string.  Fills in an |adjustments| parameter, if non-nullptr, so it reflects
// the alterations done to the string that are not one-character-to-one-
// character.  The resulting |adjustments| will always be sorted by increasing
// offset.
std::string UnescapeURLWithAdjustmentsImpl(
    std::string_view escaped_text,
    UnescapeRule::Type rules,
    OffsetAdjuster::Adjustments* adjustments) { … }

}  // namespace

std::string EscapeAllExceptUnreserved(std::string_view text) { … }

std::string EscapeQueryParamValue(std::string_view text, bool use_plus) { … }

std::string EscapePath(std::string_view path) { … }

#if BUILDFLAG(IS_APPLE)
// Escapes |precursor| with kNSURLCharmap. Plus-substitution for spaces is
// turned off and keep_escaped is turned on, so any %XX sequence already
// present in |precursor| is kept as it is.
std::string EscapeNSURLPrecursor(std::string_view precursor) {
  constexpr bool kUsePlus = false;
  constexpr bool kKeepEscaped = true;
  return Escape(precursor, kNSURLCharmap, kUsePlus, kKeepEscaped);
}
#endif  // BUILDFLAG(IS_APPLE)

std::string EscapeUrlEncodedData(std::string_view path, bool use_plus) { … }

std::string EscapeNonASCIIAndPercent(std::string_view input) { … }

std::string EscapeNonASCII(std::string_view input) { … }

std::string EscapeExternalHandlerValue(std::string_view text) { … }

void AppendEscapedCharForHTML(char c, std::string* output) { … }

std::string EscapeForHTML(std::string_view input) { … }

std::u16string EscapeForHTML(std::u16string_view input) { … }

std::string UnescapeURLComponent(std::string_view escaped_text,
                                 UnescapeRule::Type rules) { … }

std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
    std::string_view text,
    UnescapeRule::Type rules,
    OffsetAdjuster::Adjustments* adjustments) { … }

std::string UnescapeBinaryURLComponent(std::string_view escaped_text,
                                       UnescapeRule::Type rules) { … }

bool UnescapeBinaryURLComponentSafe(std::string_view escaped_text,
                                    bool fail_on_path_separators,
                                    std::string* unescaped_text) { … }

bool ContainsEncodedBytes(std::string_view escaped_text,
                          const std::set<unsigned char>& bytes) { … }

std::u16string UnescapeForHTML(std::u16string_view input) { … }

}  // namespace base
// chromium/base/strings/escape.cc