// Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Author: [email protected] (Jim Meehan) #include <google/protobuf/stubs/common.h> #include <google/protobuf/stubs/stringpiece.h> namespace google { namespace protobuf { namespace internal { // These four-byte entries compactly encode how many bytes 0..255 to delete // in making a string replacement, how many bytes to add 0..255, and the offset // 0..64k-1 of the replacement string in remap_string. struct RemapEntry { … }; // Exit type codes for state tables. All but the first get stuffed into // signed one-byte entries. The first is only generated by executable code. // To distinguish from next-state entries, these must be contiguous and // all <= kExitNone ExitReason; // This struct represents one entire state table. The three initialized byte // areas are state_table, remap_base, and remap_string. state0 and state0_size // give the byte offset and length within state_table of the initial state -- // table lookups are expected to start and end in this state, but for // truncated UTF-8 strings, may end in a different state. These allow a quick // test for that condition. entry_shift is 8 for tables subscripted by a full // byte value and 6 for space-optimized tables subscripted by only six // significant bits in UTF-8 continuation bytes. 
UTF8StateMachineObj; UTF8ScanObj; #define X__ … #define RJ_ … #define S1_ … #define S2_ … #define S3_ … #define S21 … #define S31 … #define S32 … #define T1_ … #define T2_ … #define S11 … #define SP_ … #define D__ … #define RJA … // Entire table has 9 state blocks of 256 entries each static const unsigned int utf8acceptnonsurrogates_STATE0 = …; // state[0] static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = …; // =[1] static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = …; static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = …; static const unsigned int utf8acceptnonsurrogates_SHIFT = …; static const unsigned int utf8acceptnonsurrogates_BYTES = …; static const unsigned int utf8acceptnonsurrogates_LOSUB = …; static const unsigned int utf8acceptnonsurrogates_HIADD = …; static const uint8_t utf8acceptnonsurrogates[] = …; // Remap base[0] = (del, add, string_offset) static const RemapEntry utf8acceptnonsurrogates_remap_base[] = …; // Remap string[0] static const unsigned char utf8acceptnonsurrogates_remap_string[] = …; static const unsigned char utf8acceptnonsurrogates_fast[256] = …; static const UTF8ScanObj utf8acceptnonsurrogates_obj = …; #undef X__ #undef RJ_ #undef S1_ #undef S2_ #undef S3_ #undef S21 #undef S31 #undef S32 #undef T1_ #undef T2_ #undef S11 #undef SP_ #undef D__ #undef RJA // Return true if current Tbl pointer is within state0 range // Note that unsigned compare checks both ends of range simultaneously static inline bool InStateZero(const UTF8ScanObj* st, const uint8_t* Tbl) { … } namespace { // Scan a UTF-8 string based on state table. // Always scan complete UTF-8 characters // Set number of bytes scanned. Return reason for exiting
int UTF8GenericScan(const UTF8ScanObj* st, const char * str, int str_length, int* bytes_consumed) { … } int UTF8GenericScanFastAscii(const UTF8ScanObj* st, const char * str, int str_length, int* bytes_consumed) { … } // Hack: On some compilers the static tables are initialized at startup. // We can't use them until they are initialized. However, some Protocol // Buffer parsing happens at static init time and may try to validate // UTF-8 strings. Since UTF-8 validation is only used for debugging // anyway, we simply always return success if initialization hasn't // occurred yet. bool module_initialized_ = …; struct InitDetector { … }; InitDetector init_detector; } // namespace bool IsStructurallyValidUTF8(const char* buf, int len) { … } int UTF8SpnStructurallyValid(StringPiece str) { … } // Coerce UTF-8 byte string in src_str to be // a structurally-valid equal-length string by selectively // overwriting illegal bytes with replace_char (typically blank). // replace_char must be legal printable 7-bit Ascii 0x20..0x7e. // src_str is read-only. If any overwriting is needed, a modified byte string // is created in idst, length isrclen. // // Returns pointer to output buffer, isrc if no changes were made, // or idst if some bytes were changed. // // Fast case: all is structurally valid and no byte copying is done. // char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst, const char replace_char) { … } } // namespace internal } // namespace protobuf } // namespace google