getonescriptspan.cc | Explore in Territory

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: [email protected] (Dick Sites)
//


#include "getonescriptspan.h"

#include <string.h>

#include "fixunicodevalue.h"
#include "port.h"
#include "utf8acceptinterchange.h"
#include "utf8repl_lettermarklower.h"
#include "utf8prop_lettermarkscriptnum.h"
#include "utf8scannot_lettermarkspecial.h"
#include "utf8statetable.h"

namespace chrome_lang_id {
namespace CLD2 {

// Alphabetical order for binary search, from
// generated_entities.cc
extern const int kNameToEntitySize;
extern const CharIntPair kNameToEntity[];

static const char kSpecialSymbol[256] = …;



#define LT …
#define GT …
#define EX …
#define HY …
#define QU …
#define AP …
#define SL …
#define S_ …
#define C_ …
#define R_ …
#define I_ …
#define P_ …
#define T_ …
#define Y_ …
#define L_ …
#define E_ …
#define CR …
#define NL …
#define PL …
#define xx …

// Map byte to one of ~20 interesting categories for cheap tag parsing
static const uint8 kCharToSub[256] = …;

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx


#define OK …
#define X_ …


static const int kMaxExitStateLettersMarksOnly = …;
static const int kMaxExitStateAllText = …;


// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
//   the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
static const uint8 kTagParseTbl_0[] = …;

#undef OK
#undef X_

enum
{ … };

// Debugging. Not thread safe.
static char gDisplayPiece[32];
const uint8 gCharlen[16] = …;
char* DisplayPiece(const char* next_byte_, int byte_length_) { … }



// runetochar copies (encodes) one rune, pointed to by rune, to at most
// UTFmax bytes starting at str and returns the number of bytes generated.
int runetochar(char *str, const char32 *rune) { … }



// Useful for converting an entity to an ascii value.
// RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
int LookupEntity(const char* entity_name, int entity_len) { … }

bool ascii_isdigit(char c) { … }
bool ascii_isxdigit(char c) { … }
bool ascii_isalnum(char c) { … }
int hex_digit_to_int(char c) { … }

static int32 strto32_base10(const char* nptr, const char* limit,
                            const char **endptr) { … }

static int32 strto32_base16(const char* nptr, const char* limit,
                            const char **endptr) { … }

// Unescape the current character pointed to by src.  SETS src_consumed to
// the number of chars read for the conversion (in UTF8).  If src isn't a
// valid entity, just consume the & and RETURN -1.  If src doesn't point to
// & -- which it should -- set src_consumed to 0 and RETURN -1.
int ReadEntity(const char* src, int srcn, int* src_consumed) { … }


// Src points to '&'
// Writes entity value to dst. Returns the take(src) and put(dst) byte counts
// through tlen and plen respectively (the function itself returns void).
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) { … }

// Returns true if character is < > or &, none of which are letters
bool inline IsSpecial(char c) { … }

// Quick Skip to next letter or < > & or to end of string (eos)
// Always return is_letter for eos
int ScanToLetterOrSpecial(const char* src, int len) { … }




// src points to non-letter, such as tag-opening '<'
// Return length from here to next possible letter
// On another < before >, return 1
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          |    | end of string
// advances <tag <tag2>
//          ||
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { … }

// Returns mid if key found in lo <= mid < hi, else -1
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { … }

// Returns the length in bytes of the prefix of src that is all
//  interchange valid UTF-8
int SpanInterchangeValid(const char* src, int byte_length) { … }

ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
  : … { … }

// Extended version to allow spans of any non-tag text and spans of mixed script
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text,
                             bool any_text,
                             bool any_script)
  : … { … }


ScriptScanner::~ScriptScanner() { … }




// Get to the first real non-tag letter or entity that is a letter
// Sets script of that letter
// Return len if no more letters
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { … }


// These are for ASCII-only tag names
// Compare one letter uplow to c, ignoring case of uplow
inline bool EqCase(char uplow, char c) { … }

// These are for ASCII-only tag names
// Return true for space / < > etc. all less than 0x40
inline bool NeqLetter(char c) { … }

// These are for ASCII-only tag names
// Return true for space or \n; return false for \r
inline bool WS(char c) { … }

// Canonical CR or LF
static const char LF = …;


// The naive loop scans from next_byte_ to script_buffer_ until full.
// But this can leave an awkward hard-to-identify short fragment at the
// end of the input. We would prefer to make the next-to-last fragment
// shorter and the last fragment longer.

// Copy next run of non-tag characters to buffer [NUL terminated]
// This just replaces tags with space or \n and removes entities.
// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
// including \r or \n are replaced by \n. All other tags and skipped text
// are replaced with ASCII space.
//
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneTextSpan(LangSpan* span) { … }


// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { … }

// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
// List changes with each version of Unicode, so just always lowercase
// Unicode 6.2.0:
//   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
void ScriptScanner::LowerScriptSpan(LangSpan* span) { … }

// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Force Latin, Cyrillic, Greek scripts to be lowercase
// Buffer ALWAYS has leading space and trailing space space space NUL
bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { … }

// Maps byte offset in most recent GetOneScriptSpan/Lower
// span->text [0..text_bytes] into an additional byte offset from
// span->offset, to get back to corresponding text in the original
// input buffer.
// text_offset must be the first byte
// of a UTF-8 character, or just beyond the last character. Normally this
// routine is called with the first byte of an interesting range and
// again with the first byte of the following range.
int ScriptScanner::MapBack(int text_offset) { … }


// Gets lscript number for letters; always returns
//   0 (common script) for non-letters
int GetUTF8LetterScriptNum(const char* src) { … }

}  // namespace CLD2
}  // namespace chrome_lang_id
chromium/third_party/cld_3/src/src/script_span/getonescriptspan.cc