// Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// from google3/strings/strutil.cc #include <google/protobuf/stubs/strutil.h> #include <errno.h> #include <float.h> // FLT_DIG and DBL_DIG #include <limits.h> #include <stdio.h> #include <cmath> #include <iterator> #include <limits> #include <google/protobuf/stubs/logging.h> #include <google/protobuf/stubs/stl_util.h> #ifdef _WIN32 // MSVC has only _snprintf, not snprintf. // // MinGW has both snprintf and _snprintf, but they appear to be different // functions. The former is buggy. When invoked like so: // char buffer[32]; // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f); // it prints "1.23000e+10". This is plainly wrong: %g should never print // trailing zeros after the decimal point. For some reason this bug only // occurs with some input values, not all. In any case, _snprintf does the // right thing, so we use it. #define snprintf … #endif namespace google { namespace protobuf { // These are defined as macros on some platforms. #undef them so that we can // redefine them. #undef isxdigit #undef isprint // The definitions of these in ctype.h change based on locale. Since our // string manipulation is all in relation to the protocol buffer and C++ // languages, we always want to use the C locale. So, we re-define these // exactly as we want them. inline bool isxdigit(char c) { … } inline bool isprint(char c) { … } // ---------------------------------------------------------------------- // ReplaceCharacters // Replaces any occurrence of the character 'remove' (or the characters // in 'remove') with the character 'replacewith'. // ---------------------------------------------------------------------- void ReplaceCharacters(std::string *s, const char *remove, char replacewith) { … } void StripWhitespace(std::string *str) { … } // ---------------------------------------------------------------------- // StringReplace() // Replace the "old" pattern with the "new" pattern in a string, // and append the result to "res". 
If replace_all is false, // it only replaces the first instance of "old." // ---------------------------------------------------------------------- void StringReplace(const std::string &s, const std::string &oldsub, const std::string &newsub, bool replace_all, std::string *res) { … } // ---------------------------------------------------------------------- // StringReplace() // Give me a string and two patterns "old" and "new", and I replace // the first instance of "old" in the string with "new", if it // exists. If "replace_all" is true, call this repeatedly until it // fails. RETURN a new string, regardless of whether the replacement // happened or not. // ---------------------------------------------------------------------- std::string StringReplace(const std::string &s, const std::string &oldsub, const std::string &newsub, bool replace_all) { … } // ---------------------------------------------------------------------- // SplitStringUsing() // Split a string using a character delimiter. Append the components // to 'result'. // // Note: For multi-character delimiters, this routine will split on *ANY* of // the characters in the string, not the entire string as a single delimiter. // ---------------------------------------------------------------------- template <typename ITR> static inline void SplitStringToIteratorUsing(StringPiece full, const char *delim, ITR &result) { … } void SplitStringUsing(StringPiece full, const char *delim, std::vector<std::string> *result) { … } // Split a string using a character delimiter. Append the components // to 'result'. If there are consecutive delimiters, this function // will return corresponding empty strings. The string is split into // at most the specified number of pieces greedily. This means that the // last piece may possibly be split further. To split into as many pieces // as possible, specify 0 as the number of pieces. // // If "full" is the empty string, yields an empty string as the only value. 
// // If "pieces" is negative for some reason, it returns the whole string // ---------------------------------------------------------------------- template <typename ITR> static inline void SplitStringToIteratorAllowEmpty(StringPiece full, const char *delim, int pieces, ITR &result) { … } void SplitStringAllowEmpty(StringPiece full, const char *delim, std::vector<std::string> *result) { … } // ---------------------------------------------------------------------- // JoinStrings() // This merges a vector of string components with delim inserted // as separators between components. // // ---------------------------------------------------------------------- template <class ITERATOR> static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end, const char *delim, std::string *result) { … } void JoinStrings(const std::vector<std::string> &components, const char *delim, std::string *result) { … } // ---------------------------------------------------------------------- // UnescapeCEscapeSequences() // This does all the unescaping that C does: \ooo, \r, \n, etc // Returns length of resulting string. // The implementation of \x parses any positive number of hex digits, // but it is an error if the value requires more than 8 bits, and the // result is truncated to 8 bits. // // The second call stores its errors in a supplied string vector. // If the string vector pointer is nullptr, it reports the errors with LOG(). // ---------------------------------------------------------------------- #define IS_OCTAL_DIGIT(c) … // Protocol buffers doesn't ever care about errors, but I don't want to remove // the code. 
#define LOG_STRING(LEVEL, VECTOR) … int UnescapeCEscapeSequences(const char* source, char* dest) { … } int UnescapeCEscapeSequences(const char *source, char *dest, std::vector<std::string> *errors) { … } // ---------------------------------------------------------------------- // UnescapeCEscapeString() // This does the same thing as UnescapeCEscapeSequences, but creates // a new string. The caller does not need to worry about allocating // a dest buffer. This should be used for non-performance-critical // tasks such as printing debug messages. It is safe for src and dest // to be the same. // // The second call stores its errors in a supplied string vector. // If the string vector pointer is nullptr, it reports the errors with LOG(). // // In the first and second calls, the length of dest is returned. In the // third call, the new string is returned. // ---------------------------------------------------------------------- int UnescapeCEscapeString(const std::string &src, std::string *dest) { … } int UnescapeCEscapeString(const std::string &src, std::string *dest, std::vector<std::string> *errors) { … } std::string UnescapeCEscapeString(const std::string &src) { … } // ---------------------------------------------------------------------- // CEscapeString() // CHexEscapeString() // Copies 'src' to 'dest', escaping dangerous characters using // C-style escape sequences. This is very useful for preparing query // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses // hexadecimal rather than octal sequences. // Returns the number of bytes written to 'dest' (not including the \0) // or -1 if there was insufficient space. // // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped. // ---------------------------------------------------------------------- int CEscapeInternal(const char* src, int src_len, char* dest, int dest_len, bool use_hex, bool utf8_safe) { … } // Calculates the length of the C-style escaped version of 'src'. 
// Assumes that non-printable characters are escaped using octal sequences, and // that UTF-8 bytes are not handled specially. static inline size_t CEscapedLength(StringPiece src) { … } // ---------------------------------------------------------------------- // Escapes 'src' using C-style escape sequences, and appends the escaped string // to 'dest'. This version is faster than calling CEscapeInternal as it computes // the required space using a lookup table, and also does not do any special // handling for Hex or UTF-8 characters. // ---------------------------------------------------------------------- void CEscapeAndAppend(StringPiece src, std::string *dest) { … } std::string CEscape(const std::string &src) { … } namespace strings { std::string Utf8SafeCEscape(const std::string &src) { … } std::string CHexEscape(const std::string &src) { … } } // namespace strings // ---------------------------------------------------------------------- // strto32_adaptor() // strtou32_adaptor() // Implementation of strto[u]l replacements that have identical // overflow and underflow characteristics for both ILP-32 and LP-64 // platforms, including errno preservation in error-free calls. 
// ---------------------------------------------------------------------- int32_t strto32_adaptor(const char *nptr, char **endptr, int base) { … } uint32_t strtou32_adaptor(const char *nptr, char **endptr, int base) { … } inline bool safe_parse_sign(std::string *text /*inout*/, bool *negative_ptr /*output*/) { … } template <typename IntType> bool safe_parse_positive_int(std::string text, IntType *value_p) { … } template <typename IntType> bool safe_parse_negative_int(const std::string &text, IntType *value_p) { … } template <typename IntType> bool safe_int_internal(std::string text, IntType *value_p) { … } template <typename IntType> bool safe_uint_internal(std::string text, IntType *value_p) { … } // ---------------------------------------------------------------------- // FastIntToBuffer() // FastInt64ToBuffer() // FastHexToBuffer() // FastHex64ToBuffer() // FastHex32ToBuffer() // ---------------------------------------------------------------------- // Offset into buffer where FastInt64ToBuffer places the end of string // null character. Also used by FastInt64ToBufferLeft. static const int kFastInt64ToBufferOffset = …; char *FastInt64ToBuffer(int64_t i, char* buffer) { … } // Offset into buffer where FastInt32ToBuffer places the end of string // null character. Also used by FastInt32ToBufferLeft static const int kFastInt32ToBufferOffset = …; // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the // compiler to generate 32 bit arithmetic instructions. It's much faster, at // least with 32 bit binaries. 
char *FastInt32ToBuffer(int32_t i, char* buffer) { … } char *FastHexToBuffer(int i, char* buffer) { … } char *InternalFastHexToBuffer(uint64_t value, char* buffer, int num_byte) { … } char *FastHex64ToBuffer(uint64_t value, char* buffer) { … } char *FastHex32ToBuffer(uint32_t value, char* buffer) { … } // ---------------------------------------------------------------------- // FastInt32ToBufferLeft() // FastUInt32ToBufferLeft() // FastInt64ToBufferLeft() // FastUInt64ToBufferLeft() // // Like the Fast*ToBuffer() functions above, these are intended for speed. // Unlike the Fast*ToBuffer() functions, however, these functions write // their output to the beginning of the buffer (hence the name, as the // output is left-aligned). The caller is responsible for ensuring that // the buffer has enough space to hold the output. // // Returns a pointer to the end of the string (i.e. the null character // terminating the string). // ---------------------------------------------------------------------- static const char two_ASCII_digits[100][2] = …; char* FastUInt32ToBufferLeft(uint32_t u, char* buffer) { … } char* FastInt32ToBufferLeft(int32_t i, char* buffer) { … } char* FastUInt64ToBufferLeft(uint64_t u64, char* buffer) { … } char* FastInt64ToBufferLeft(int64_t i, char* buffer) { … } // ---------------------------------------------------------------------- // SimpleItoa() // Description: converts an integer to a string. 
// // Return value: string // ---------------------------------------------------------------------- std::string SimpleItoa(int i) { … } std::string SimpleItoa(unsigned int i) { … } std::string SimpleItoa(long i) { … } std::string SimpleItoa(unsigned long i) { … } std::string SimpleItoa(long long i) { … } std::string SimpleItoa(unsigned long long i) { … } // ---------------------------------------------------------------------- // SimpleDtoa() // SimpleFtoa() // DoubleToBuffer() // FloatToBuffer() // We want to print the value without losing precision, but we also do // not want to print more digits than necessary. This turns out to be // trickier than it sounds. Numbers like 0.2 cannot be represented // exactly in binary. If we print 0.2 with a very large precision, // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". // On the other hand, if we set the precision too low, we lose // significant digits when printing numbers that actually need them. // It turns out there is no precision value that does the right thing // for all numbers. // // Our strategy is to first try printing with a precision that is never // over-precise, then parse the result with strtod() to see if it // matches. If not, we print again with a precision that will always // give a precise result, but may use more digits than necessary. // // An arguably better strategy would be to use the algorithm described // in "How to Print Floating-Point Numbers Accurately" by Steele & // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, // however, that the following implementation is about as fast as // DMG's code. Furthermore, DMG's code locks mutexes, which means it // will not scale well on multi-core machines. DMG's code is slightly // more accurate (in that it will never use more digits than // necessary), but this is probably irrelevant for most users. // // Rob Pike and Ken Thompson also have an implementation of dtoa() in // third_party/fmt/fltfmt.cc. 
Their implementation is similar to this // one in that it makes guesses and then uses strtod() to check them. // Their implementation is faster because they use their own code to // generate the digits in the first place rather than use snprintf(), // thus avoiding format string parsing overhead. However, this makes // it considerably more complicated than the following implementation, // and it is embedded in a larger library. If speed turns out to be // an issue, we could re-implement this in terms of their // implementation. // ---------------------------------------------------------------------- std::string SimpleDtoa(double value) { … } std::string SimpleFtoa(float value) { … } static inline bool IsValidFloatChar(char c) { … } void DelocalizeRadix(char* buffer) { … } char* DoubleToBuffer(double value, char* buffer) { … } static int memcasecmp(const char *s1, const char *s2, size_t len) { … } inline bool CaseEqual(StringPiece s1, StringPiece s2) { … } bool safe_strtob(StringPiece str, bool* value) { … } bool safe_strtof(const char* str, float* value) { … } bool safe_strtod(const char* str, double* value) { … } bool safe_strto32(const std::string &str, int32_t *value) { … } bool safe_strtou32(const std::string &str, uint32_t *value) { … } bool safe_strto64(const std::string &str, int64_t *value) { … } bool safe_strtou64(const std::string &str, uint64_t *value) { … } char* FloatToBuffer(float value, char* buffer) { … } namespace strings { AlphaNum::AlphaNum(strings::Hex hex) { … } } // namespace strings // ---------------------------------------------------------------------- // StrCat() // This merges the given strings or integers, with no delimiter. This // is designed to be the fastest possible way to construct a string out // of a mix of raw C strings, C++ strings, and integer values. 
// ---------------------------------------------------------------------- // Append is merely a version of memcpy that returns the address of the byte // after the area just overwritten. It comes in multiple flavors to minimize // call overhead. static char *Append1(char *out, const AlphaNum &x) { … } static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { … } static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2, const AlphaNum &x3, const AlphaNum &x4) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d, const AlphaNum &e) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, const AlphaNum &g) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, const AlphaNum &g, const AlphaNum &h) { … } std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { … } // It's possible to call StrAppend with a char * pointer that is partway into // the string we're appending to. However the results of this are random. // Therefore, check for this in debug mode. Use unsigned math so we only have // to do one comparison. 
#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) … void StrAppend(std::string *result, const AlphaNum &a) { … } void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) { … } void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { … } void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, const AlphaNum &d) { … } int GlobalReplaceSubstring(const std::string &substring, const std::string &replacement, std::string *s) { … } int CalculateBase64EscapedLen(int input_len, bool do_padding) { … } // Base64Escape does padding, so this calculation includes padding. int CalculateBase64EscapedLen(int input_len) { … } // ---------------------------------------------------------------------- // int Base64Unescape() - base64 decoder // int Base64Escape() - base64 encoder // int WebSafeBase64Unescape() - Google's variation of base64 decoder // int WebSafeBase64Escape() - Google's variation of base64 encoder // // Check out // http://tools.ietf.org/html/rfc2045 for formal description, but what we // care about is that... // Take the encoded stuff in groups of 4 characters and turn each // character into a code 0 to 63 thus: // A-Z map to 0 to 25 // a-z map to 26 to 51 // 0-9 map to 52 to 61 // +(- for WebSafe) maps to 62 // /(_ for WebSafe) maps to 63 // There will be four numbers, all less than 64 which can be represented // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). // Arrange the 6 digit binary numbers into three bytes as such: // aaaaaabb bbbbcccc ccdddddd // Equals signs (one or two) are used at the end of the encoded block to // indicate that the text was not an integer multiple of three bytes long. 
// ---------------------------------------------------------------------- int Base64UnescapeInternal(const char *src_param, int szsrc, char *dest, int szdest, const signed char* unbase64) { … } // The arrays below were generated by the following code // #include <sys/time.h> // #include <stdlib.h> // #include <string.h> // #include <stdio.h> // main() // { // static const char Base64[] = // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; // const char *pos; // int idx, i, j; // printf(" "); // for (i = 0; i < 255; i += 8) { // for (j = i; j < i + 8; j++) { // pos = strchr(Base64, j); // if ((pos == nullptr) || (j == 0)) // idx = -1; // else // idx = pos - Base64; // if (idx == -1) // printf(" %2d, ", idx); // else // printf(" %2d/""*%c*""/,", idx, j); // } // printf("\n "); // } // } // // where the value of "Base64[]" was replaced by one of the base-64 conversion // tables from the functions below. static const signed char kUnBase64[] = …; static const signed char kUnWebSafeBase64[] = …; int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { … } static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest, const signed char *unbase64) { … } bool Base64Unescape(StringPiece src, std::string *dest) { … } bool WebSafeBase64Unescape(StringPiece src, std::string *dest) { … } int Base64EscapeInternal(const unsigned char *src, int szsrc, char *dest, int szdest, const char *base64, bool do_padding) { … } static const char kBase64Chars[] = …; static const char kWebSafeBase64Chars[] = …; int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { … } int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest, int szdest, bool do_padding) { … } void Base64EscapeInternal(const unsigned char *src, int szsrc, std::string *dest, bool do_padding, const char *base64_chars) { … } void Base64Escape(const unsigned char *src, int szsrc, std::string *dest, bool do_padding) { … } void 
WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest, bool do_padding) { … } void Base64Escape(StringPiece src, std::string *dest) { … } void WebSafeBase64Escape(StringPiece src, std::string *dest) { … } void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) { … } // Helper to append a Unicode code point to a string as UTF8, without bringing // in any external dependencies. int EncodeAsUTF8Char(uint32_t code_point, char* output) { … } // Table of UTF-8 character lengths, based on first byte static const unsigned char kUTF8LenTbl[256] = …; // Return length of a single UTF-8 source character int UTF8FirstLetterNumBytes(const char* src, int len) { … } // ---------------------------------------------------------------------- // CleanStringLineEndings() // Clean up a multi-line string to conform to Unix line endings. // Reads from src and appends to dst, so usually dst should be empty. // // If there is no line ending at the end of a non-empty string, it can // be added automatically. // // Four different types of input are correctly handled: // // - Unix/Linux files: line ending is LF: pass through unchanged // // - DOS/Windows files: line ending is CRLF: convert to LF // // - Legacy Mac files: line ending is CR: convert to LF // // - Garbled files: random line endings: convert gracefully // lonely CR, lonely LF, CRLF: convert to LF // // @param src The multi-line string to convert // @param dst The converted string is appended to this string // @param auto_end_last_line Automatically terminate the last line // // Limitations: // // This does not do the right thing for CRCRLF files created by // broken programs that do another Unix->DOS conversion on files // that are already in CRLF format. 
For this, a two-pass approach // (brute-force) would be needed that // // (1) determines the presence of LF (first one is ok) // (2) if yes, removes any CR, else converts every CR to LF void CleanStringLineEndings(const std::string &src, std::string *dst, bool auto_end_last_line) { … } void CleanStringLineEndings(std::string *str, bool auto_end_last_line) { … } namespace internal { // ---------------------------------------------------------------------- // NoLocaleStrtod() // This code will make you cry. // ---------------------------------------------------------------------- namespace { // Returns a string identical to *input except that the character pointed to // by radix_pos (which should be '.') is replaced with the locale-specific // radix character. std::string LocalizeRadix(const char *input, const char *radix_pos) { … } } // namespace double NoLocaleStrtod(const char *str, char **endptr) { … } } // namespace internal } // namespace protobuf } // namespace google