utf8statetable.cc | Explore in Territory

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// State Table follower for scanning UTF-8 strings without converting to
// 32- or 16-bit Unicode values.
//

#ifdef COMPILER_MSVC
// MSVC warns: warning C4309: 'initializing' : truncation of constant value
// But the value is in fact not truncated.  0xFF still comes out 0xFF at
// runtime.
#pragma warning ( disable : 4309 )
#endif

#include "utf8statetable.h"

#include <stdint.h>                     // for uintptr_t
#include <string.h>                     // for NULL, memcpy, memmove

#include "integral_types.h" // for uint8, uint32, int8
#include "offsetmap.h"
#include "port.h"
#include "stringpiece.h"

namespace chrome_lang_id {
namespace CLD2 {

static const int kReplaceAndResumeFlag = …; // Bit in del byte to distinguish
                                               // optional next-state field
                                               // after replacement text
static const int kHtmlPlaintextFlag = …;    // Bit in add byte to distinguish
                                               // HTML replacement vs. plaintext

/**
 * This code implements a little interpreter for UTF8 state
 * tables. There are three kinds of quite-similar state tables,
 * property, scanning, and replacement. Each state in one of
 * these tables consists of an array of 256 or 64 one-byte
 * entries. The state is subscripted by an incoming source byte,
 * and the entry either specifies the next state or specifies an
 * action. Space-optimized tables have full 256-entry states for
 * the first byte of a UTF-8 character, but only 64-entry states
 * for continuation bytes. Space-optimized tables may only be
 * used with source input that has been checked to be
 * structurally- (or stronger interchange-) valid.
 *
 * A property state table has an unsigned one-byte property for
 * each possible UTF-8 character. One-byte character properties
 * are in the state[0] array, while for other lengths the
 * state[0] array gives the next state, which contains the
 * property value for two-byte characters or yet another state
 * for longer ones. The code simply loads the right number of
 * next-state values, then returns the final byte as property
 * value. There are no actions specified in property tables.
 * States are typically shared for multi-byte UTF-8 characters
 * that all have the same property value.
 *
 * A scanning state table has entries that are either a
 * next-state specifier for bytes that are accepted by the
 * scanner, or an exit action for the last byte of each
 * character that is rejected by the scanner.
 *
 * Scanning long strings involves a tight loop that picks up one
 * byte at a time and follows next-state value back to state[0]
 * for each accepted UTF-8 character. Scanning stops at the end
 * of the string or at the first character encountered that has
 * an exit action such as "reject". Timing information is given
 * below.
 *
 * Since so much of Google's text is 7-bit-ASCII values
 * (approximately 94% of the bytes of web documents), the
 * scanning interpreter has two speed optimizations. One checks
 * 8 bytes at a time to see if they are all in the range lo..hi,
 * as specified in constants in the overall statetable object.
 * The check involves ORing together four 4-byte values that
 * overflow into the high bit of some byte when a byte is out of
 * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
 * loop is about 8x faster than the one-byte-at-a-time loop.
 *
 * If checking for exit bytes in the 0x00-0x1F and 7F range is
 * unneeded, an even faster loop just looks at the high bits of
 * 8 bytes at once, and is about 1.33x faster than the lo..hi
 * loop.
 *
 * Exit from the scanning routines backs up to the first byte of
 * the rejected character, so the text spanned is always a
 * complete number of UTF-8 characters. The normal scanning exit
 * is at the first rejected character, or at the end of the
 * input text. Scanning also exits on any detected ill-formed
 * character or at a special do-again action built into some
 * exit-optimized tables. The do-again action gets back to the
 * top of the scanning loop to retry eight-byte ASCII scans. It
 * is typically put into state tables after four seven-bit-ASCII
 * characters in a row are seen, to allow restarting the fast
 * scan after some slower processing of multi-byte characters.
 *
 * A replacement state table is similar to a scanning state
 * table but has more extensive actions. The default
 * byte-at-a-time loop copies one byte from source to
 * destination and goes to the next state. The replacement
 * actions overwrite 1-3 bytes of the destination with different
 * bytes, possibly shortening the output by 1 or 2 bytes. The
 * replacement bytes come from within the state table, from
 * dummy states inserted just after any state that contains a
 * replacement action. This gives a quick address calculation for
 * the replacement byte(s) and gives some cache locality.
 *
 * Additional replacement actions use one or two bytes from
 * within dummy states to index a side table of more-extensive
 * replacements. The side table specifies a length of 0..15
 * destination bytes to overwrite and a length of 0..127 bytes
 * to overwrite them with, plus the actual replacement bytes.
 *
 * This side table uses one extra bit to specify a pair of
 * replacements, the first to be used in an HTML context and the
 * second to be used in a plaintext context. This allows
 * replacements that are spelled with "&lt;" in the former
 * context and "<" in the latter.
 *
 * The side table also uses an extra bit to specify a non-zero
 * next state after a replacement. This allows a combination
 * replacement and state change, used to implement a limited
 * version of the Boyer-Moore algorithm for multi-character
 * replacement without backtracking. This is useful when there
 * are overlapping replacements, such as ch => x and also c =>
 * y, the latter to be used only if the character after c is not
 * h. in this case, the state[0] table's entry for c would
 * change c to y and also have a next-state of say n, and the
 * state[n] entry for h would specify a replacement of the two
 * bytes yh by x. No backtracking is needed.
 *
 * A replacement table may also include the exit actions of a
 * scanning state table, so some character sequences can
 * terminate early.
 *
 * During replacement, an optional data structure called an
 * offset map can be updated to reflect each change in length
 * between source and destination. This offset map can later be
 * used to map destination-string offsets to corresponding
 * source-string offsets or vice versa.
 *
 * The routines below also have variants in which state-table
 * entries are all two bytes instead of one byte. This allows
 * tables with more than 240 total states, but takes up twice as
 * much space per state.
 *
**/

// All intentional fallthroughs in breakpad are in this file, so define
// this macro locally.
// If you ever move this to a .h file, make sure it's defined in a
// private header file: clang suggests the first macro expanding to
// [[clang::fallthrough]] in its diagnostics, so if BP_FALLTHROUGH
// is visible in code depending on breakpad, clang would suggest
// BP_FALLTHROUGH for code depending on breakpad, instead of the
// client code's own fallthrough macro.
#if defined(__clang__)
#define CLD_FALLTHROUGH …
#else
#define CLD_FALLTHROUGH
#endif

// Return true if current Tbl pointer is within state0 range
// Note that unsigned compare checks both ends of range simultaneously
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { … }

static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
                                 const unsigned short int* Tbl) { … }

// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericProperty(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen) { … }

bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { … }


// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.
// Internally, to next-to-last offset is multiplied by 16 and the last
// offset is relative instead of absolute.
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen) { … }

// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { … }


// TwoByte versions are needed for tables > 240 states
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
                          const uint8** src,
                          int* srclen) { … }

// TwoByte versions are needed for tables > 240 states
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { … }


// Approximate speeds on 2.8 GHz Pentium 4:
//   GenericScan 1-byte loop           300 MB/sec *
//   GenericScan 4-byte loop          1200 MB/sec
//   GenericScan 8-byte loop          2400 MB/sec *
//   GenericScanFastAscii 4-byte loop 3000 MB/sec
//   GenericScanFastAscii 8-byte loop 3200 MB/sec *
//
// * Implemented below. FastAscii loop is memory-bandwidth constrained.

// Scan a UTF-8 stringpiece based on state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
int UTF8GenericScan(const UTF8ScanObj* st,
                    const StringPiece& str,
                    int* bytes_consumed) { … }

// Scan a UTF-8 stringpiece based on state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
                    const StringPiece& str,
                    int* bytes_consumed) { … }

// Hack to change halfwidth katakana to match an old UTF8CharToLower()

// Return number of src bytes skipped
static int DoSpecialFixup(const unsigned char c,
                    const unsigned char** srcp, const unsigned char* srclimit,
                    unsigned char** dstp, unsigned char* dstlimit) { … }


// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
//   and doing text replacements.
// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
//   Needs caller to loop on kExitDoAgain
static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap) { … }

// TwoByte versions are needed for tables > 240 states, such
// as the table for full Unicode 4.1 canonical + compatibility mapping

// Scan a UTF-8 stringpiece based on state table with two-byte entries,
//   copying to output stringpiece
//   and doing text replacements.
// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
//   Needs caller to loop on kExitDoAgain
static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap) { … }


// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
//   and doing text replacements.
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap) { … }

// Older version without offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed) { … }

// Older version without is_plain_text or offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed) { … }

// Scan a UTF-8 stringpiece based on state table with two-byte entries,
//   copying to output stringpiece
//   and doing text replacements.
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap) { … }

// Older version without offsetmap
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed) { … }

// Older version without is_plain_text or offsetmap
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed) { … }



// Adjust a stringpiece to encompass complete UTF-8 characters.
// The data pointer will be increased by 0..3 bytes to get to a character
// boundary, and the length will then be decreased by 0..3 bytes
// to encompass the last complete character.
void UTF8TrimToChars(StringPiece* istr) { … }

}       // End namespace CLD2
}       // End namespace chrome_lang_id
chromium/third_party/cld_3/src/src/script_span/utf8statetable.cc