// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // State Table follower for scanning UTF-8 strings without converting to // 32- or 16-bit Unicode values. // #ifdef COMPILER_MSVC // MSVC warns: warning C4309: 'initializing' : truncation of constant value // But the value is in fact not truncated. 0xFF still comes out 0xFF at // runtime. #pragma warning ( disable : 4309 ) #endif #include "utf8statetable.h" #include <stdint.h> // for uintptr_t #include <string.h> // for NULL, memcpy, memmove #include "integral_types.h" // for uint8, uint32, int8 #include "offsetmap.h" #include "port.h" #include "stringpiece.h" namespace chrome_lang_id { namespace CLD2 { static const int kReplaceAndResumeFlag = …; // Bit in del byte to distinguish // optional next-state field // after replacement text static const int kHtmlPlaintextFlag = …; // Bit in add byte to distinguish // HTML replacement vs. plaintext /** * This code implements a little interpreter for UTF8 state * tables. There are three kinds of quite-similar state tables, * property, scanning, and replacement. Each state in one of * these tables consists of an array of 256 or 64 one-byte * entries. The state is subscripted by an incoming source byte, * and the entry either specifies the next state or specifies an * action. Space-optimized tables have full 256-entry states for * the first byte of a UTF-8 character, but only 64-entry states * for continuation bytes. Space-optimized tables may only be * used with source input that has been checked to be * structurally- (or stronger interchange-) valid. * * A property state table has an unsigned one-byte property for * each possible UTF-8 character. One-byte character properties * are in the state[0] array, while for other lengths the * state[0] array gives the next state, which contains the * property value for two-byte characters or yet another state * for longer ones. The code simply loads the right number of * next-state values, then returns the final byte as property * value. There are no actions specified in property tables. * States are typically shared for multi-byte UTF-8 characters * that all have the same property value. * * A scanning state table has entries that are either a * next-state specifier for bytes that are accepted by the * scanner, or an exit action for the last byte of each * character that is rejected by the scanner. * * Scanning long strings involves a tight loop that picks up one * byte at a time and follows next-state value back to state[0] * for each accepted UTF-8 character. Scanning stops at the end * of the string or at the first character encountered that has * an exit action such as "reject". Timing information is given * below. * * Since so much of Google's text is 7-bit-ASCII values * (approximately 94% of the bytes of web documents), the * scanning interpreter has two speed optimizations. One checks * 8 bytes at a time to see if they are all in the range lo..hi, * as specified in constants in the overall statetable object. * The check involves ORing together four 4-byte values that * overflow into the high bit of some byte when a byte is out of * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This * loop is about 8x faster than the one-byte-at-a-time loop. * * If checking for exit bytes in the 0x00-0x1F and 7F range is * unneeded, an even faster loop just looks at the high bits of * 8 bytes at once, and is about 1.33x faster than the lo..hi * loop. * * Exit from the scanning routines backs up to the first byte of * the rejected character, so the text spanned is always a * complete number of UTF-8 characters. The normal scanning exit * is at the first rejected character, or at the end of the * input text. Scanning also exits on any detected ill-formed * character or at a special do-again action built into some * exit-optimized tables. The do-again action gets back to the * top of the scanning loop to retry eight-byte ASCII scans. It * is typically put into state tables after four seven-bit-ASCII * characters in a row are seen, to allow restarting the fast * scan after some slower processing of multi-byte characters. * * A replacement state table is similar to a scanning state * table but has more extensive actions. The default * byte-at-a-time loop copies one byte from source to * destination and goes to the next state. The replacement * actions overwrite 1-3 bytes of the destination with different * bytes, possibly shortening the output by 1 or 2 bytes. The * replacement bytes come from within the state table, from * dummy states inserted just after any state that contains a * replacement action. This gives a quick address calculation for * the replacement byte(s) and gives some cache locality. * * Additional replacement actions use one or two bytes from * within dummy states to index a side table of more-extensive * replacements. The side table specifies a length of 0..15 * destination bytes to overwrite and a length of 0..127 bytes * to overwrite them with, plus the actual replacement bytes. * * This side table uses one extra bit to specify a pair of * replacements, the first to be used in an HTML context and the * second to be used in a plaintext context. This allows * replacements that are spelled with "<" in the former * context and "<" in the latter. * * The side table also uses an extra bit to specify a non-zero * next state after a replacement. This allows a combination * replacement and state change, used to implement a limited * version of the Boyer-Moore algorithm for multi-character * replacement without backtracking. This is useful when there * are overlapping replacements, such as ch => x and also c => * y, the latter to be used only if the character after c is not * h. in this case, the state[0] table's entry for c would * change c to y and also have a next-state of say n, and the * state[n] entry for h would specify a replacement of the two * bytes yh by x. No backtracking is needed. * * A replacement table may also include the exit actions of a * scanning state table, so some character sequences can * terminate early. * * During replacement, an optional data structure called an * offset map can be updated to reflect each change in length * between source and destination. This offset map can later be * used to map destination-string offsets to corresponding * source-string offsets or vice versa. * * The routines below also have variants in which state-table * entries are all two bytes instead of one byte. This allows * tables with more than 240 total states, but takes up twice as * much space per state. * **/ // All intentional fallthroughs in breakpad are in this file, so define // this macro locally. // If you ever move this to a .h file, make sure it's defined in a // private header file: clang suggests the first macro expanding to // [[clang::fallthrough]] in its diagnostics, so if BP_FALLTHROUGH // is visible in code depending on breakpad, clang would suggest // BP_FALLTHROUGH for code depending on breakpad, instead of the // client code's own fallthrough macro. #if defined(__clang__) #define CLD_FALLTHROUGH … #else #define CLD_FALLTHROUGH #endif // Return true if current Tbl pointer is within state0 range // Note that unsigned compare checks both ends of range simultaneously static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { … } static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, const unsigned short int* Tbl) { … } // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if input is ill-formed uint8 UTF8GenericProperty(const UTF8PropObj* st, const uint8** src, int* srclen) { … } bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { … } // BigOneByte versions are needed for tables > 240 states, but most // won't need the TwoByte versions. // Internally, to next-to-last offset is multiplied by 16 and the last // offset is relative instead of absolute. // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if input is ill-formed uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, const uint8** src, int* srclen) { … } // BigOneByte versions are needed for tables > 240 states, but most // won't need the TwoByte versions. bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { … } // TwoByte versions are needed for tables > 240 states // Look up property of one UTF-8 character and advance over it // Return 0 if input length is zero // Return 0 and advance one byte if input is ill-formed uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, const uint8** src, int* srclen) { … } // TwoByte versions are needed for tables > 240 states bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { … } // Approximate speeds on 2.8 GHz Pentium 4: // GenericScan 1-byte loop 300 MB/sec * // GenericScan 4-byte loop 1200 MB/sec // GenericScan 8-byte loop 2400 MB/sec * // GenericScanFastAscii 4-byte loop 3000 MB/sec // GenericScanFastAscii 8-byte loop 3200 MB/sec * // // * Implemented below. FastAscii loop is memory-bandwidth constrained. // Scan a UTF-8 stringpiece based on state table. // Always scan complete UTF-8 characters // Set number of bytes scanned. Return reason for exiting int UTF8GenericScan(const UTF8ScanObj* st, const StringPiece& str, int* bytes_consumed) { … } // Scan a UTF-8 stringpiece based on state table. // Always scan complete UTF-8 characters // Set number of bytes scanned. Return reason for exiting // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid int UTF8GenericScanFastAscii(const UTF8ScanObj* st, const StringPiece& str, int* bytes_consumed) { … } // Hack to change halfwidth katakana to match an old UTF8CharToLower() // Return number of src bytes skipped static int DoSpecialFixup(const unsigned char c, const unsigned char** srcp, const unsigned char* srclimit, unsigned char** dstp, unsigned char* dstlimit) { … } // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece // and doing text replacements. // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below // Needs caller to loop on kExitDoAgain static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed, OffsetMap* offsetmap) { … } // TwoByte versions are needed for tables > 240 states, such // as the table for full Unicode 4.1 canonical + compatibility mapping // Scan a UTF-8 stringpiece based on state table with two-byte entries, // copying to output stringpiece // and doing text replacements. // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below // Needs caller to loop on kExitDoAgain static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed, OffsetMap* offsetmap) { … } // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece // and doing text replacements. // Also writes an optional OffsetMap. Pass NULL to skip writing one. // Always scan complete UTF-8 characters // Set number of bytes consumed from input, number filled to output. // Return reason for exiting int UTF8GenericReplace(const UTF8ReplaceObj* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed, OffsetMap* offsetmap) { … } // Older version without offsetmap int UTF8GenericReplace(const UTF8ReplaceObj* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed) { … } // Older version without is_plain_text or offsetmap int UTF8GenericReplace(const UTF8ReplaceObj* st, const StringPiece& istr, StringPiece& ostr, int* bytes_consumed, int* bytes_filled, int* chars_changed) { … } // Scan a UTF-8 stringpiece based on state table with two-byte entries, // copying to output stringpiece // and doing text replacements. // Also writes an optional OffsetMap. Pass NULL to skip writing one. // Always scan complete UTF-8 characters // Set number of bytes consumed from input, number filled to output. // Return reason for exiting int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed, OffsetMap* offsetmap) { … } // Older version without offsetmap int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, bool is_plain_text, int* bytes_consumed, int* bytes_filled, int* chars_changed) { … } // Older version without is_plain_text or offsetmap int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, const StringPiece& istr, StringPiece& ostr, int* bytes_consumed, int* bytes_filled, int* chars_changed) { … } // Adjust a stringpiece to encompass complete UTF-8 characters. // The data pointer will be increased by 0..3 bytes to get to a character // boundary, and the length will then be decreased by 0..3 bytes // to encompass the last complete character. void UTF8TrimToChars(StringPiece* istr) { … } } // End namespace CLD2 } // End namespace chrome_lang_id