strutil.cc | Explore in Territory

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// from google3/strings/strutil.cc

#include <google/protobuf/stubs/strutil.h>

#include <errno.h>
#include <float.h>    // FLT_DIG and DBL_DIG
#include <limits.h>
#include <stdio.h>
#include <cmath>
#include <iterator>
#include <limits>

#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/stl_util.h>

#ifdef _WIN32
// MSVC has only _snprintf, not snprintf.
//
// MinGW has both snprintf and _snprintf, but they appear to be different
// functions.  The former is buggy.  When invoked like so:
//   char buffer[32];
//   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
// it prints "1.23000e+10".  This is plainly wrong:  %g should never print
// trailing zeros after the decimal point.  For some reason this bug only
// occurs with some input values, not all.  In any case, _snprintf does the
// right thing, so we use it.
#define snprintf …
#endif

namespace google {
namespace protobuf {

// These are defined as macros on some platforms.  #undef them so that we can
// redefine them.
#undef isxdigit
#undef isprint

// The definitions of these in ctype.h change based on locale.  Since our
// string manipulation is all in relation to the protocol buffer and C++
// languages, we always want to use the C locale.  So, we re-define these
// exactly as we want them.
inline bool isxdigit(char c) { … }

inline bool isprint(char c) { … }

// ----------------------------------------------------------------------
// ReplaceCharacters
//    Replaces any occurrence of the character 'remove' (or the characters
//    in 'remove') with the character 'replacewith'.
// ----------------------------------------------------------------------
void ReplaceCharacters(std::string *s, const char *remove, char replacewith) { … }

void StripWhitespace(std::string *str) { … }

// ----------------------------------------------------------------------
// StringReplace()
//    Replace the "old" pattern with the "new" pattern in a string,
//    and append the result to "res".  If replace_all is false,
//    it only replaces the first instance of "old."
// ----------------------------------------------------------------------

void StringReplace(const std::string &s, const std::string &oldsub,
                   const std::string &newsub, bool replace_all,
                   std::string *res) { … }

// ----------------------------------------------------------------------
// StringReplace()
//    Give me a string and two patterns "old" and "new", and I replace
//    the first instance of "old" in the string with "new", if it
//    exists.  If "global" is true; call this repeatedly until it
//    fails.  RETURN a new string, regardless of whether the replacement
//    happened or not.
// ----------------------------------------------------------------------

std::string StringReplace(const std::string &s, const std::string &oldsub,
                          const std::string &newsub, bool replace_all) { … }

// ----------------------------------------------------------------------
// SplitStringUsing()
//    Split a string using a character delimiter. Append the components
//    to 'result'.
//
// Note: For multi-character delimiters, this routine will split on *ANY* of
// the characters in the string, not the entire string as a single delimiter.
// ----------------------------------------------------------------------
template <typename ITR>
static inline void SplitStringToIteratorUsing(StringPiece full,
                                              const char *delim, ITR &result) { … }

void SplitStringUsing(StringPiece full, const char *delim,
                      std::vector<std::string> *result) { … }

// Split a string using a character delimiter. Append the components
// to 'result'.  If there are consecutive delimiters, this function
// will return corresponding empty strings. The string is split into
// at most the specified number of pieces greedily. This means that the
// last piece may possibly be split further. To split into as many pieces
// as possible, specify 0 as the number of pieces.
//
// If "full" is the empty string, yields an empty string as the only value.
//
// If "pieces" is negative for some reason, it returns the whole string
// ----------------------------------------------------------------------
template <typename ITR>
static inline void SplitStringToIteratorAllowEmpty(StringPiece full,
                                                   const char *delim,
                                                   int pieces, ITR &result) { … }

void SplitStringAllowEmpty(StringPiece full, const char *delim,
                           std::vector<std::string> *result) { … }

// ----------------------------------------------------------------------
// JoinStrings()
//    This merges a vector of string components with delim inserted
//    as separaters between components.
//
// ----------------------------------------------------------------------
template <class ITERATOR>
static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end,
                                const char *delim, std::string *result) { … }

void JoinStrings(const std::vector<std::string> &components, const char *delim,
                 std::string *result) { … }

// ----------------------------------------------------------------------
// UnescapeCEscapeSequences()
//    This does all the unescaping that C does: \ooo, \r, \n, etc
//    Returns length of resulting string.
//    The implementation of \x parses any positive number of hex digits,
//    but it is an error if the value requires more than 8 bits, and the
//    result is truncated to 8 bits.
//
//    The second call stores its errors in a supplied string vector.
//    If the string vector pointer is nullptr, it reports the errors with LOG().
// ----------------------------------------------------------------------

#define IS_OCTAL_DIGIT(c) …

// Protocol buffers doesn't ever care about errors, but I don't want to remove
// the code.
#define LOG_STRING(LEVEL, VECTOR) …

int UnescapeCEscapeSequences(const char* source, char* dest) { … }

int UnescapeCEscapeSequences(const char *source, char *dest,
                             std::vector<std::string> *errors) { … }

// ----------------------------------------------------------------------
// UnescapeCEscapeString()
//    This does the same thing as UnescapeCEscapeSequences, but creates
//    a new string. The caller does not need to worry about allocating
//    a dest buffer. This should be used for non performance critical
//    tasks such as printing debug messages. It is safe for src and dest
//    to be the same.
//
//    The second call stores its errors in a supplied string vector.
//    If the string vector pointer is nullptr, it reports the errors with LOG().
//
//    In the first and second calls, the length of dest is returned. In the
//    the third call, the new string is returned.
// ----------------------------------------------------------------------
int UnescapeCEscapeString(const std::string &src, std::string *dest) { … }

int UnescapeCEscapeString(const std::string &src, std::string *dest,
                          std::vector<std::string> *errors) { … }

std::string UnescapeCEscapeString(const std::string &src) { … }

// ----------------------------------------------------------------------
// CEscapeString()
// CHexEscapeString()
//    Copies 'src' to 'dest', escaping dangerous characters using
//    C-style escape sequences. This is very useful for preparing query
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
//    hexadecimal rather than octal sequences.
//    Returns the number of bytes written to 'dest' (not including the \0)
//    or -1 if there was insufficient space.
//
//    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
// ----------------------------------------------------------------------
int CEscapeInternal(const char* src, int src_len, char* dest,
                    int dest_len, bool use_hex, bool utf8_safe) { … }

// Calculates the length of the C-style escaped version of 'src'.
// Assumes that non-printable characters are escaped using octal sequences, and
// that UTF-8 bytes are not handled specially.
static inline size_t CEscapedLength(StringPiece src) { … }

// ----------------------------------------------------------------------
// Escapes 'src' using C-style escape sequences, and appends the escaped string
// to 'dest'. This version is faster than calling CEscapeInternal as it computes
// the required space using a lookup table, and also does not do any special
// handling for Hex or UTF-8 characters.
// ----------------------------------------------------------------------
void CEscapeAndAppend(StringPiece src, std::string *dest) { … }

std::string CEscape(const std::string &src) { … }

namespace strings {

std::string Utf8SafeCEscape(const std::string &src) { … }

std::string CHexEscape(const std::string &src) { … }

}  // namespace strings

// ----------------------------------------------------------------------
// strto32_adaptor()
// strtou32_adaptor()
//    Implementation of strto[u]l replacements that have identical
//    overflow and underflow characteristics for both ILP-32 and LP-64
//    platforms, including errno preservation in error-free calls.
// ----------------------------------------------------------------------

int32_t strto32_adaptor(const char *nptr, char **endptr, int base) { … }

uint32_t strtou32_adaptor(const char *nptr, char **endptr, int base) { … }

inline bool safe_parse_sign(std::string *text /*inout*/,
                            bool *negative_ptr /*output*/) { … }

template <typename IntType>
bool safe_parse_positive_int(std::string text, IntType *value_p) { … }

template <typename IntType>
bool safe_parse_negative_int(const std::string &text, IntType *value_p) { … }

template <typename IntType>
bool safe_int_internal(std::string text, IntType *value_p) { … }

template <typename IntType>
bool safe_uint_internal(std::string text, IntType *value_p) { … }

// ----------------------------------------------------------------------
// FastIntToBuffer()
// FastInt64ToBuffer()
// FastHexToBuffer()
// FastHex64ToBuffer()
// FastHex32ToBuffer()
// ----------------------------------------------------------------------

// Offset into buffer where FastInt64ToBuffer places the end of string
// null character.  Also used by FastInt64ToBufferLeft.
static const int kFastInt64ToBufferOffset = …;

char *FastInt64ToBuffer(int64_t i, char* buffer) { … }

// Offset into buffer where FastInt32ToBuffer places the end of string
// null character.  Also used by FastInt32ToBufferLeft
static const int kFastInt32ToBufferOffset = …;

// Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
// compiler to generate 32 bit arithmetic instructions.  It's much faster, at
// least with 32 bit binaries.
char *FastInt32ToBuffer(int32_t i, char* buffer) { … }

char *FastHexToBuffer(int i, char* buffer) { … }

char *InternalFastHexToBuffer(uint64_t value, char* buffer, int num_byte) { … }

char *FastHex64ToBuffer(uint64_t value, char* buffer) { … }

char *FastHex32ToBuffer(uint32_t value, char* buffer) { … }

// ----------------------------------------------------------------------
// FastInt32ToBufferLeft()
// FastUInt32ToBufferLeft()
// FastInt64ToBufferLeft()
// FastUInt64ToBufferLeft()
//
// Like the Fast*ToBuffer() functions above, these are intended for speed.
// Unlike the Fast*ToBuffer() functions, however, these functions write
// their output to the beginning of the buffer (hence the name, as the
// output is left-aligned).  The caller is responsible for ensuring that
// the buffer has enough space to hold the output.
//
// Returns a pointer to the end of the string (i.e. the null character
// terminating the string).
// ----------------------------------------------------------------------

static const char two_ASCII_digits[100][2] = …;

char* FastUInt32ToBufferLeft(uint32_t u, char* buffer) { … }

char* FastInt32ToBufferLeft(int32_t i, char* buffer) { … }

char* FastUInt64ToBufferLeft(uint64_t u64, char* buffer) { … }

char* FastInt64ToBufferLeft(int64_t i, char* buffer) { … }

// ----------------------------------------------------------------------
// SimpleItoa()
//    Description: converts an integer to a string.
//
//    Return value: string
// ----------------------------------------------------------------------

std::string SimpleItoa(int i) { … }

std::string SimpleItoa(unsigned int i) { … }

std::string SimpleItoa(long i) { … }

std::string SimpleItoa(unsigned long i) { … }

std::string SimpleItoa(long long i) { … }

std::string SimpleItoa(unsigned long long i) { … }

// ----------------------------------------------------------------------
// SimpleDtoa()
// SimpleFtoa()
// DoubleToBuffer()
// FloatToBuffer()
//    We want to print the value without losing precision, but we also do
//    not want to print more digits than necessary.  This turns out to be
//    trickier than it sounds.  Numbers like 0.2 cannot be represented
//    exactly in binary.  If we print 0.2 with a very large precision,
//    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
//    On the other hand, if we set the precision too low, we lose
//    significant digits when printing numbers that actually need them.
//    It turns out there is no precision value that does the right thing
//    for all numbers.
//
//    Our strategy is to first try printing with a precision that is never
//    over-precise, then parse the result with strtod() to see if it
//    matches.  If not, we print again with a precision that will always
//    give a precise result, but may use more digits than necessary.
//
//    An arguably better strategy would be to use the algorithm described
//    in "How to Print Floating-Point Numbers Accurately" by Steele &
//    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
//    however, that the following implementation is about as fast as
//    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
//    will not scale well on multi-core machines.  DMG's code is slightly
//    more accurate (in that it will never use more digits than
//    necessary), but this is probably irrelevant for most users.
//
//    Rob Pike and Ken Thompson also have an implementation of dtoa() in
//    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
//    one in that it makes guesses and then uses strtod() to check them.
//    Their implementation is faster because they use their own code to
//    generate the digits in the first place rather than use snprintf(),
//    thus avoiding format string parsing overhead.  However, this makes
//    it considerably more complicated than the following implementation,
//    and it is embedded in a larger library.  If speed turns out to be
//    an issue, we could re-implement this in terms of their
//    implementation.
// ----------------------------------------------------------------------

std::string SimpleDtoa(double value) { … }

std::string SimpleFtoa(float value) { … }

static inline bool IsValidFloatChar(char c) { … }

void DelocalizeRadix(char* buffer) { … }

char* DoubleToBuffer(double value, char* buffer) { … }

static int memcasecmp(const char *s1, const char *s2, size_t len) { … }

inline bool CaseEqual(StringPiece s1, StringPiece s2) { … }

bool safe_strtob(StringPiece str, bool* value) { … }

bool safe_strtof(const char* str, float* value) { … }

bool safe_strtod(const char* str, double* value) { … }

bool safe_strto32(const std::string &str, int32_t *value) { … }

bool safe_strtou32(const std::string &str, uint32_t *value) { … }

bool safe_strto64(const std::string &str, int64_t *value) { … }

bool safe_strtou64(const std::string &str, uint64_t *value) { … }

char* FloatToBuffer(float value, char* buffer) { … }

namespace strings {

AlphaNum::AlphaNum(strings::Hex hex) { … }

}  // namespace strings

// ----------------------------------------------------------------------
// StrCat()
//    This merges the given strings or integers, with no delimiter.  This
//    is designed to be the fastest possible way to construct a string out
//    of a mix of raw C strings, C++ strings, and integer values.
// ----------------------------------------------------------------------

// Append is merely a version of memcpy that returns the address of the byte
// after the area just overwritten.  It comes in multiple flavors to minimize
// call overhead.
static char *Append1(char *out, const AlphaNum &x) { … }

static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { … }

static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2,
                     const AlphaNum &x3, const AlphaNum &x4) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d, const AlphaNum &e) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
                   const AlphaNum &g) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
                   const AlphaNum &g, const AlphaNum &h) { … }

std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
                   const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
                   const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { … }

// It's possible to call StrAppend with a char * pointer that is partway into
// the string we're appending to.  However the results of this are random.
// Therefore, check for this in debug mode.  Use unsigned math so we only have
// to do one comparison.
#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) …

void StrAppend(std::string *result, const AlphaNum &a) { … }

void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) { … }

void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
               const AlphaNum &c) { … }

void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
               const AlphaNum &c, const AlphaNum &d) { … }

int GlobalReplaceSubstring(const std::string &substring,
                           const std::string &replacement, std::string *s) { … }

int CalculateBase64EscapedLen(int input_len, bool do_padding) { … }

// Base64Escape does padding, so this calculation includes padding.
int CalculateBase64EscapedLen(int input_len) { … }

// ----------------------------------------------------------------------
// int Base64Unescape() - base64 decoder
// int Base64Escape() - base64 encoder
// int WebSafeBase64Unescape() - Google's variation of base64 decoder
// int WebSafeBase64Escape() - Google's variation of base64 encoder
//
// Check out
// http://tools.ietf.org/html/rfc2045 for formal description, but what we
// care about is that...
//   Take the encoded stuff in groups of 4 characters and turn each
//   character into a code 0 to 63 thus:
//           A-Z map to 0 to 25
//           a-z map to 26 to 51
//           0-9 map to 52 to 61
//           +(- for WebSafe) maps to 62
//           /(_ for WebSafe) maps to 63
//   There will be four numbers, all less than 64 which can be represented
//   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
//   Arrange the 6 digit binary numbers into three bytes as such:
//   aaaaaabb bbbbcccc ccdddddd
//   Equals signs (one or two) are used at the end of the encoded block to
//   indicate that the text was not an integer multiple of three bytes long.
// ----------------------------------------------------------------------

int Base64UnescapeInternal(const char *src_param, int szsrc,
                           char *dest, int szdest,
                           const signed char* unbase64) { … }

// The arrays below were generated by the following code
// #include <sys/time.h>
// #include <stdlib.h>
// #include <string.h>
// #include <stdio.h>
// main()
// {
//   static const char Base64[] =
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//   const char *pos;
//   int idx, i, j;
//   printf("    ");
//   for (i = 0; i < 255; i += 8) {
//     for (j = i; j < i + 8; j++) {
//       pos = strchr(Base64, j);
//       if ((pos == nullptr) || (j == 0))
//         idx = -1;
//       else
//         idx = pos - Base64;
//       if (idx == -1)
//         printf(" %2d,     ", idx);
//       else
//         printf(" %2d/""*%c*""/,", idx, j);
//     }
//     printf("\n    ");
//   }
// }
//
// where the value of "Base64[]" was replaced by one of the base-64 conversion
// tables from the functions below.
static const signed char kUnBase64[] = …;
static const signed char kUnWebSafeBase64[] = …;

int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { … }

static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest,
                                   const signed char *unbase64) { … }

bool Base64Unescape(StringPiece src, std::string *dest) { … }

bool WebSafeBase64Unescape(StringPiece src, std::string *dest) { … }

int Base64EscapeInternal(const unsigned char *src, int szsrc,
                         char *dest, int szdest, const char *base64,
                         bool do_padding) { … }

static const char kBase64Chars[] = …;

static const char kWebSafeBase64Chars[] = …;

int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { … }
int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
                        int szdest, bool do_padding) { … }

void Base64EscapeInternal(const unsigned char *src, int szsrc,
                          std::string *dest, bool do_padding,
                          const char *base64_chars) { … }

void Base64Escape(const unsigned char *src, int szsrc, std::string *dest,
                  bool do_padding) { … }

void WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest,
                         bool do_padding) { … }

void Base64Escape(StringPiece src, std::string *dest) { … }

void WebSafeBase64Escape(StringPiece src, std::string *dest) { … }

void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) { … }

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
int EncodeAsUTF8Char(uint32_t code_point, char* output) { … }

// Table of UTF-8 character lengths, based on first byte
static const unsigned char kUTF8LenTbl[256] = …;

// Return length of a single UTF-8 source character
int UTF8FirstLetterNumBytes(const char* src, int len) { … }

// ----------------------------------------------------------------------
// CleanStringLineEndings()
//   Clean up a multi-line string to conform to Unix line endings.
//   Reads from src and appends to dst, so usually dst should be empty.
//
//   If there is no line ending at the end of a non-empty string, it can
//   be added automatically.
//
//   Four different types of input are correctly handled:
//
//     - Unix/Linux files: line ending is LF: pass through unchanged
//
//     - DOS/Windows files: line ending is CRLF: convert to LF
//
//     - Legacy Mac files: line ending is CR: convert to LF
//
//     - Garbled files: random line endings: convert gracefully
//                      lonely CR, lonely LF, CRLF: convert to LF
//
//   @param src The multi-line string to convert
//   @param dst The converted string is appended to this string
//   @param auto_end_last_line Automatically terminate the last line
//
//   Limitations:
//
//     This does not do the right thing for CRCRLF files created by
//     broken programs that do another Unix->DOS conversion on files
//     that are already in CRLF format.  For this, a two-pass approach
//     brute-force would be needed that
//
//       (1) determines the presence of LF (first one is ok)
//       (2) if yes, removes any CR, else convert every CR to LF

void CleanStringLineEndings(const std::string &src, std::string *dst,
                            bool auto_end_last_line) { … }

void CleanStringLineEndings(std::string *str, bool auto_end_last_line) { … }

namespace internal {

// ----------------------------------------------------------------------
// NoLocaleStrtod()
//   This code will make you cry.
// ----------------------------------------------------------------------

namespace {

// Returns a string identical to *input except that the character pointed to
// by radix_pos (which should be '.') is replaced with the locale-specific
// radix character.
std::string LocalizeRadix(const char *input, const char *radix_pos) { … }

}  // namespace

double NoLocaleStrtod(const char *str, char **endptr) { … }

}  // namespace internal

}  // namespace protobuf
}  // namespace google
chromium/third_party/protobuf/src/google/protobuf/stubs/strutil.cc