// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: [email protected] (Dick Sites)
//

#include "getonescriptspan.h"
#include <string.h>
#include "fixunicodevalue.h"
#include "port.h"
#include "utf8acceptinterchange.h"
#include "utf8repl_lettermarklower.h"
#include "utf8prop_lettermarkscriptnum.h"
#include "utf8scannot_lettermarkspecial.h"
#include "utf8statetable.h"

namespace chrome_lang_id {
namespace CLD2 {

// Entity-name table, in alphabetical order for binary search. Defined in
// generated_entities.cc
extern const int kNameToEntitySize;
extern const CharIntPair kNameToEntity[];

// One entry per possible byte value.
// NOTE(review): contents are not visible in this view; presumably flags the
// bytes that need special handling while scanning -- confirm.
static const char kSpecialSymbol[256] = …;

// Short aliases for the byte categories stored in kCharToSub. They exist only
// to keep the table initializer readable and are #undef'd right after it.
#define LT …
#define GT …
#define EX …
#define HY …
#define QU …
#define AP …
#define SL …
#define S_ …
#define C_ …
#define R_ …
#define I_ …
#define P_ …
#define T_ …
#define Y_ …
#define L_ …
#define E_ …
#define CR …
#define NL …
#define PL …
#define xx …

// Map byte to one of ~20 interesting categories for cheap tag parsing
static const uint8 kCharToSub[256] = …;

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx

// Short aliases that are #undef'd right after kTagParseTbl_0.
// NOTE(review): presumably used only inside that table's initializer -- the
// initializer is not visible in this view; confirm.
#define OK …
#define X_ …

// Largest state number that terminates the tag-parsing state machine for each
// scanning mode.
// NOTE(review): per the state-machine description below these are presumably
// 1 (letters/marks only: exit on state [0] or [1]) and 2 (all text: state [2]
// also exits) -- confirm against the initializers.
static const int kMaxExitStateLettersMarksOnly = …;
static const int kMaxExitStateAllText = …;

// State machine to do cheap parse of non-letter strings incl. tags.
// In each diagram the bars mark the start of the scan and where it stops.
// advances <tag>
//          |    |
// advances <tag> ... </tag> for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          || (0)
// advances <tag <tag2>
//          || (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
// the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
static const uint8 kTagParseTbl_0[] = …;

#undef OK
#undef X_

enum { … };

// Debugging. Not thread safe.
static char gDisplayPiece[32];

// NOTE(review): 16-entry table; presumably the UTF-8 character length indexed
// by the high four bits of the lead byte -- confirm against the initializer.
const uint8 gCharlen[16] = …;

// Debugging helper.
// NOTE(review): presumably formats the start of [next_byte_, next_byte_ +
// byte_length_) into the shared gDisplayPiece buffer and returns it, which is
// why it is not thread safe -- body not visible here; confirm.
char* DisplayPiece(const char* next_byte_, int byte_length_) { … }

// runetochar copies (encodes) one rune, pointed to by rune, to at most
// UTFmax bytes starting at str and returns the number of bytes generated.
int runetochar(char *str, const char32 *rune) { … }

// Useful for converting an entity to an ascii value.
// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
// in entity_name.
int LookupEntity(const char* entity_name, int entity_len) { … }

// ASCII-only character classification helpers.
// NOTE(review): presumably locale-independent equivalents of isdigit /
// isxdigit / isalnum -- bodies not visible here; confirm.
bool ascii_isdigit(char c) { … }

bool ascii_isxdigit(char c) { … }

bool ascii_isalnum(char c) { … }

// NOTE(review): presumably maps one hexadecimal digit character to its value
// 0..15; behavior for non-hex input is not visible here -- confirm.
int hex_digit_to_int(char c) { … }

// NOTE(review): presumably parses a base-10 integer from [nptr, limit) and
// sets *endptr to the first unconsumed byte, strtol-style; overflow handling
// is not visible here -- confirm.
static int32 strto32_base10(const char* nptr, const char* limit,
                            const char **endptr) { … }

// NOTE(review): base-16 counterpart of strto32_base10; same caveats apply.
static int32 strto32_base16(const char* nptr, const char* limit,
                            const char **endptr) { … }

// Unescape the current character pointed to by src. SETS the number
// of chars read for the conversion (in UTF8). If src isn't a valid entity,
// just consume the & and RETURN -1. If src doesn't point to & -- which it
// should -- set src_consumed to 0 and RETURN -1.
int ReadEntity(const char* src, int srcn, int* src_consumed) { … }

// Src points to '&'
// Writes entity value to dst.
// Reports the take(src) and put(dst) byte counts. The function itself is
// void, so these presumably come back through tlen and plen respectively.
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) { … }

// Returns true if character is < > or &, none of which are letters
bool inline IsSpecial(char c) { … }

// Quick Skip to next letter or < > & or to end of string (eos)
// Always return is_letter for eos
int ScanToLetterOrSpecial(const char* src, int len) { … }

// src points to non-letter, such as tag-opening '<'
// Return length from here to next possible letter
// On another < before >, return 1
// In each diagram the bars mark the start of the scan and where it stops.
// advances <tag>
//          |    |
// advances <tag> ... </tag> for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          |    | end of string
// advances <tag <tag2>
//          ||
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { … }

// Returns mid if key found in lo <= mid < hi, else -1
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { … }

// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8
int SpanInterchangeValid(const char* src, int byte_length) { … }

// Basic constructor: scans buffer[0..buffer_length).
// NOTE(review): is_plain_text presumably disables tag/entity handling --
// initializer list and body are not visible here; confirm.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : … { … }

// Extended version to allow spans of any non-tag text and spans of mixed script
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text,
                             bool any_text,
                             bool any_script)
    : … { … }

ScriptScanner::~ScriptScanner() { … }

// Get to the first real non-tag letter or entity that is a letter
// Sets script of that letter
// Return len if no more letters
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { … }

// These are for ASCII-only tag names
// Compare one letter uplow to c, ignoring case of uplow
inline bool EqCase(char uplow, char c) { … }

// These are for ASCII-only tag names
// Return true for space / < > etc. all less than 0x40
inline bool NeqLetter(char c) { … }

// These are for ASCII-only tag names
// Return true for space \n false for \r
inline bool WS(char c) { … }

// Canonical CR or LF
static const char LF = …;

// The naive loop scans from next_byte_ to script_buffer_ until full.
// But this can leave an awkward hard-to-identify short fragment at the
// end of the input. We would prefer to make the next-to-last fragment
// shorter and the last fragment longer.

// Copy next run of non-tag characters to buffer [NUL terminated]
// This just replaces tags with space or \n and removes entities.
// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
// including \r or \n are replaced by \n. All other tags and skipped text
// are replaced with ASCII space.
//
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneTextSpan(LangSpan* span) { … }

// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { … }

// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
// List changes with each version of Unicode, so just always lowercase
// Unicode 6.2.0:
// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
void ScriptScanner::LowerScriptSpan(LangSpan* span) { … }

// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Force Latin, Cyrillic, Greek scripts to be lowercase
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { … }

// Maps byte offset in most recent GetOneScriptSpan/Lower
// span->text [0..text_bytes] into an additional byte offset from
// span->offset, to get back to corresponding text in the original
// input buffer.
// text_offset must be the first byte
// of a UTF-8 character, or just beyond the last character. Normally this
// routine is called with the first byte of an interesting range and
// again with the first byte of the following range.
int ScriptScanner::MapBack(int text_offset) { … }

// Gets lscript number for letters; always returns
// 0 (common script) for non-letters
int GetUTF8LetterScriptNum(const char* src) { … }

}  // namespace CLD2
}  // namespace chrome_lang_id