structurally_valid.cc | Explore in Territory

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: jrm@google.com (Jim Meehan)

#include <google/protobuf/stubs/common.h>

#include <google/protobuf/stubs/stringpiece.h>

namespace google {
namespace protobuf {
namespace internal {

// Each RemapEntry is a four-byte record that compactly encodes one string
// replacement:
//   - how many bytes to delete (0..255),
//   - how many bytes to add (0..255), and
//   - the offset (0..64k-1) of the replacement string within remap_string.
// NOTE(review): the field declarations are elided in this view; confirm the
// member names and order against the full definition.
struct RemapEntry { … };

// Exit type codes for state tables. All but the first get stuffed into
// signed one-byte entries. The first is only generated by executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
ExitReason;


// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
UTF8StateMachineObj;

UTF8ScanObj;

// Short, fixed-width macro names that are only meant to live for the span of
// the table definitions that follow: every one of them is #undef'd again
// right after the utf8acceptnonsurrogates tables.
// NOTE(review): presumably these are the cell values used inside the state
// table initializer (exit codes and next-state offsets) -- the expansions and
// the initializer are elided in this view, so confirm before relying on it.
#define X__ …
#define RJ_ …
#define S1_ …
#define S2_ …
#define S3_ …
#define S21 …
#define S31 …
#define S32 …
#define T1_ …
#define T2_ …
#define S11 …
#define SP_ …
#define D__ …
#define RJA …

// Data for the "utf8acceptnonsurrogates" scanner: a set of scalar parameters,
// the byte-wide state table itself, the remap tables, a 256-entry "fast"
// table, and finally the UTF8ScanObj that bundles them.
// NOTE(review): all initializers are elided in this view. Presumably the
// scalar constants and the arrays below are what utf8acceptnonsurrogates_obj
// is initialized from -- confirm against the full source.
//
//  Entire table has 9 state blocks of 256 entries each
static const unsigned int utf8acceptnonsurrogates_STATE0 = …;     // state[0]
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = …;  // =[1]
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = …;
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = …;
static const unsigned int utf8acceptnonsurrogates_SHIFT = …;
static const unsigned int utf8acceptnonsurrogates_BYTES = …;
static const unsigned int utf8acceptnonsurrogates_LOSUB = …;
static const unsigned int utf8acceptnonsurrogates_HIADD = …;

// The state table proper, stored as unsigned bytes.
static const uint8_t utf8acceptnonsurrogates[] = …;

// Remap base[0] = (del, add, string_offset)
static const RemapEntry utf8acceptnonsurrogates_remap_base[] = …;

// Remap string[0]
static const unsigned char utf8acceptnonsurrogates_remap_string[] = …;

// One entry per possible byte value (256 entries).
// NOTE(review): presumably consulted by the fast-ASCII scan path -- confirm.
static const unsigned char utf8acceptnonsurrogates_fast[256] = …;

// The scan object handed to the scanning routines for this table.
static const UTF8ScanObj utf8acceptnonsurrogates_obj = …;


// Remove the short macro names defined ahead of the tables so that they do
// not remain visible to the code that follows.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA

// Return true if current Tbl pointer is within state0 range
// Note that unsigned compare checks both ends of range simultaneously
//
//   st:  scan object describing the state table being walked.
//   Tbl: current position within that state table.
// NOTE(review): per the description of the state-table struct earlier in the
// file, the state0 range is the state0_size bytes starting at offset state0
// within state_table; presumably UTF8ScanObj exposes the same fields and this
// helper compares against them -- body elided here, confirm.
static inline bool InStateZero(const UTF8ScanObj* st, const uint8_t* Tbl) { … }

namespace {

// Scan a UTF-8 string based on state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
//
//   st:             scan object whose state table drives the scan.
//   str:            start of the bytes to scan.
//   str_length:     number of bytes available at str.
//   bytes_consumed: output; receives the number of bytes scanned.
// NOTE(review): the int return value is presumably one of the ExitReason
// codes described earlier in the file -- body elided here, confirm.
int UTF8GenericScan(const UTF8ScanObj* st,
                    const char * str,
                    int str_length,
                    int* bytes_consumed) { … }

// Takes the same parameters and returns the same type as UTF8GenericScan:
//   st:             scan object whose state table drives the scan.
//   str:            start of the bytes to scan.
//   str_length:     number of bytes available at str.
//   bytes_consumed: output pointer for the number of bytes scanned.
// NOTE(review): judging by the name, this is presumably a variant of
// UTF8GenericScan that skips quickly over runs of 7-bit ASCII before falling
// back to the table-driven scan -- the body is elided here, confirm.
int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
                    const char * str,
                    int str_length,
                    int* bytes_consumed) { … }

// Hack:  On some compilers the static tables are initialized at startup.
//   We can't use them until they are initialized.  However, some Protocol
//   Buffer parsing happens at static init time and may try to validate
//   UTF-8 strings.  Since UTF-8 validation is only used for debugging
//   anyway, we simply always return success if initialization hasn't
//   occurred yet.

// Flag recording whether this module's static initialization has happened.
// NOTE(review): initial value elided in this view -- presumably false.
bool module_initialized_ = …;

// Helper type whose only instance, init_detector, is a namespace-scope
// object and is therefore constructed during static initialization.
// NOTE(review): presumably its constructor sets module_initialized_ to true;
// the struct body is elided here -- confirm.
struct InitDetector { … };
InitDetector init_detector;

}  // namespace

// Reports whether the len bytes starting at buf are structurally valid UTF-8.
// NOTE(review): the body is elided in this view. Per the "Hack" note on
// module_initialized_, validation is expected to report success when the
// static tables have not been initialized yet; presumably the check is made
// with the utf8acceptnonsurrogates scan object -- confirm both.
bool IsStructurallyValidUTF8(const char* buf, int len) { … }

// NOTE(review): the body is elided in this view. By analogy with strspn, this
// presumably returns the length in bytes of the longest prefix of str that is
// structurally valid UTF-8 -- confirm, including what it returns before the
// static tables are initialized.
int UTF8SpnStructurallyValid(StringPiece str) { … }

// Coerce the UTF-8 byte string in src_str to be a structurally-valid string
// of equal length by selectively overwriting illegal bytes with replace_char
// (typically blank).
// replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
// src_str is read-only. If any overwriting is needed, a modified byte string
// is created in idst, with the same length as src_str.
//
// Returns pointer to output buffer: the source bytes of src_str if no changes
//  were made, or idst if some bytes were changed.
//
// Fast case: all is structurally valid and no byte copying is done.
//
// NOTE(review): earlier wording of this comment used the names "isrc" and
// "isrclen", which are not parameters of this function; they presumably refer
// to src_str's data pointer and length. idst presumably must have room for at
// least as many bytes as src_str holds. Body elided here -- confirm both.
//
char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst,
                                    const char replace_char) { … }

}  // namespace internal
}  // namespace protobuf
}  // namespace google
chromium/third_party/protobuf/src/google/protobuf/stubs/structurally_valid.cc