compact_enc_det.cc | Explore in Territory

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#include "compact_enc_det/compact_enc_det.h"

#include <math.h>                       // for sqrt
#include <stddef.h>                     // for size_t
#include <stdio.h>                      // for printf, fprintf, NULL, etc
#include <stdlib.h>                     // for qsort
#include <string.h>                     // for memset, memcpy, memcmp, etc
#include <memory>
#include <string>                       // for string, operator==, etc

#include "compact_enc_det/compact_enc_det_hint_code.h"
#include "util/string_util.h"
#include "util/basictypes.h"
#include "util/commandlineflags.h"
#include "util/logging.h"

string;

// TODO as of 2007.10.09:
//
// Consider font=TT-BHxxx as user-defined => binary
// Demote GB18030 if no 8x3x pair
// Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
// Consider removing/ignoring bytes 01-1F to avoid crap pollution
// Possibly boost declared encoding in robust scan
// googlebot tiny files
// look for ranges of encodings
// consider tags just as > < within aligned block of 32
// flag too few characters in postproc (Latin 6 problem)
// Remove slow scan beyond 16KB
// Consider removing kMostLikelyEncoding or cut it in half


// A note on mixed encodings
//
// The most common encoding error on the web is a page containing a mixture of
// CP-1252 and UTF-8. A less common encoding error is a third-party feed that
// has been converted from CP-1252 to UTF-8 and then those bytes converted a
// second time to UTF-8. CED originally attempted to detect these error cases
// by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
// implementation was to start these just below CP1252 and UTF8 respectively in
// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
// found.
//
// The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
// UTF8CP1252 internal encoding was added late and not put into encodings.proto,
// so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
// is removed in this November 2011 CL.
//
// Mixed encoding detection never worked out as well as envisioned, so the
// ced_allow_utf8utf8 flag normally disables all this.
//
// The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
// UTF8, and the inputconverter code for UTF8 normally will convert bare
// CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
// and double-UTF-8 mixtures will be detected as UTF-8, and the double
// conversion will stand.
//
// However, it is occasionally useful to use CED to detect double-converted
// UTF-8 coming from third-party data feeds, so they can be fixed at the source.
// For this purpose, the UTF8UTF8 encoding remains available under the
// ced_allow_utf8utf8 flag.
//
// When UTF8UTF8 is detected, the inputconverter code will undo the double
// conversion, giving good text.

// Norbert Runge has noted these words in CP1252 that are mistakenly identified
// as UTF-8 because of the last pair of characters:
//  NESTLÉ®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
//  drauß\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
//  Mutterschoß\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
//  Schoß\u201c           0xDF 0x93 U+00DF U+201C
//  weiß\u201c            0xDF 0x93 U+00DF U+201C
//  Schnellfuß\u201c      0xDF 0x93 U+00DF U+201C
//  süß«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
// These four byte combinations now explicitly boost Latin1/CP1252.

// And for reference, here are a couple of Portuguese spellings
// that may be mistaken as double-byte encodings.
//   informações          0xE7 0xF5
//   traição              0xE7 0xE3


static const char* kVersion = …;

// Gates the UTF8UTF8 result encoding discussed in the "A note on mixed
// encodings" comment earlier in this file. Defaults to false; per that note,
// it is meant for spotting double-converted UTF-8 in third-party feeds.
DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
                                       "to handle mixtures of CP1252 "
                                       "converted to UTF-8 zero, one, "
                                       "or two times");
// Scan-size limits, both in Kbytes:
//   enc_detect_slow_max_kb  (default 16)  -- bytes examined when looking for
//                                            the 7-bit-only encodings
//   enc_detect_fast_max_kb  (default 256) -- bytes examined for encoding
//                                            detection in general
DEFINE_int32(enc_detect_slow_max_kb, 16,
             "Maximum number of Kbytes to examine for "
             "7-bit-only (2022, Hz, UTF7) encoding detect. "
             "You are unlikely to want to change this.");
DEFINE_int32(enc_detect_fast_max_kb, 256,
             "Maximum number of Kbytes to examine for encoding detect. "
             "You are unlikely to want to change this.");

// Minimum lead of the first choice over the second choice for a result to be
// considered reliable. Per the help text the unit is 30 per bit of
// probability difference, so the default of 300 is 10 bits, i.e. a minimum
// 1024x difference.
DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
             "difference 1st - 2nd to be considered reliable \n"
             "  2 corresponds to min 4x difference\n"
             "  4 corresponds to min 16x difference\n"
             "  8 corresponds to min 256x difference\n"
             "  10 corresponds to min 1024x difference\n"
             "  20 corresponds to min 1Mx difference.");

// Text debug output options (all default to false)
DEFINE_bool(enc_detect_summary, false,
            "Print first 16 interesting pairs at exit.");
DEFINE_bool(counts, false, "Count major-section usage");

// PostScript debug output options (all default to false).
// The two detail flags write their output to stderr.
DEFINE_bool(enc_detect_detail, false,
             "Print PostScript of every update, to stderr.");
DEFINE_bool(enc_detect_detail2, false,
             "More PostScript detail of every update, to stderr.");
DEFINE_bool(enc_detect_source, false, "Include source text in detail");
// Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
// lang_enc.cc

// Following flags are not in use. Replace them with constants to
// avoid static initialization.

//DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
//DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");

static const char* const FLAGS_enc_detect_watch1 = …;
static const char* const FLAGS_enc_detect_watch2 = …;

// Only for experiments. Delete soon. (Defaults to false.)
DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");

// Demo-mode/debugging experiments (all default to false)
DEFINE_bool(demo_nodefault, false,
             "Default to all equal; no boost for declared encoding.");
DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");


static const int XDECILOG2 = …;             // Multiplier for log base 2 ** n/10
static const int XLOG2 = …;                // Multiplier for log base 2 ** n

static const int kFinalPruneDifference = …;
                                            // Final bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned

// NOTE(review): the identifier is misspelled ("Initital"). It is left as-is
// here because every use elsewhere would have to be renamed with it.
static const int kInititalPruneDifference = …;
                                            // Initial bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned
                                            //
static const int kPruneDiffDecrement = …;
                                            // Decrements bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned

static const int kSmallInitDiff = …;       // bits of minimum
                                            // probability difference, base to
                                            // superset encodings

static const int kBoostInitial = …;    // bits of boost for
                                            // initial byte patterns (BOM, 00)

static const int kBadPairWhack = …;    // bits of whack for
                                            // one bad pair

static const int kBoostOnePair = …;    // bits of boost for
                                            // one good pair in Hz, etc.

static const int kGentleOnePair = …;    // bits of boost for
                                            // one good sequence
                                            //
static const int kGentlePairWhack = …;       // bits of whack
                                            // for ill-formed sequence

static const int kGentlePairBoost = …;       // bits of boost
                                            // for well-formed sequence

static const int kDeclaredEncBoost = …;  // bits/10 of boost for
                                            // best declared encoding per bigram

static const int kBestEncBoost = …;     // bits/10 of boost for
                                            // best encoding per bigram

static const int kTrigramBoost = …; // bits of boost for Latin127 tri

static const int kMaxPairs = …;            // Max interesting pairs to look at
                                            // If you change this,
                                            // adjust *PruneDiff*

static const int kPruneMask = …;         // Prune every 8 interesting pairs


static const int kBestPairsCount = …;      // For first N pairs, do extra boost
                                            // based on most likely encoding
                                            // of pair over entire web

static const int kDerateHintsBelow = …;    // If we have fewer than N bigrams,
                                            // weaken the hints enough that
                                            // unhinted encodings have a hope of
                                            // rising to the top

static const int kMinRescanLength = …;    // Don't bother rescanning for
                                            // unreliable encoding if fewer
                                            // than this many bytes unscanned.
                                            // We will rescan at most last half
                                            // of this.

static const int kStrongBinary = …;  // Make F_BINARY the only encoding
static const int kWeakerBinary = …;   // Make F_BINARY likely encoding

// These are byte counts from front of file
static const int kBinaryHardAsciiLimit = …;  // Not binary if all ASCII
static const int kBinarySoftAsciiLimit = …;  //   "   if mostly ASCII

// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.
static const int kMaxBigramsTagTitleText = …;      // Keep only some tag text
static const int kWeightshiftForTagTitleText = …;   // Give text in tags, etc.
                                                    // 1/16 normal weight

static const int kStrongPairs = …;          // Let reliable enc with this many
                                            // pairs overcome missing hint

enum CEDInternalFlags { … };

// Forward declaration; the definition is not in this part of the file.
// From the signature: text/text_length is the input buffer; url_hint,
// http_charset_hint, meta_charset_hint, encoding_hint, language_hint and
// corpus_type are caller-supplied hints. bytes_consumed, is_reliable and
// second_best_enc are non-const pointers -- presumably outputs; confirm
// against the definition.
Encoding InternalDetectEncoding(
    CEDInternalFlags flags, const char* text, int text_length,
    const char* url_hint, const char* http_charset_hint,
    const char* meta_charset_hint, const int encoding_hint,
    const Language language_hint,  // User interface lang
    const CompactEncDet::TextCorpusType corpus_type,
    bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
    Encoding* second_best_enc);

UnigramEntry;

//typedef struct {
//  uint8 b12[256*256]; // Bigram probability for aligned bigram
//} FullBigramEntry;


// Include all the postproc-generated tables here:
// RankedEncoding
// kMapToEncoding
// unigram_table
// kMostLikelyEncoding
// kTLDHintProbs
// kCharsetHintProbs
// HintEntry, kMaxTldKey kMaxTldVector, etc.
// =============================================================================

#include "compact_enc_det/compact_enc_det_generated_tables.h"


#define F_ASCII …

#define F_BINARY …
#define F_UTF8UTF8 …
#define F_BIG5_CP950 …
#define F_Unicode …
// =============================================================================

// 7-bit encodings have at least one "interesting" byte value < 0x80
//   (00 0E 1B + ~)
// JIS 2022-cn 2022-kr hz utf7
// Unicode UTF-16 UTF-32
// 8-bit encodings have no interesting byte values < 0x80
static const uint32 kSevenBitActive = …;   // needs <80 to detect
static const uint32 kUTF7Active     = …;   // <80 and +
static const uint32 kHzActive       = …;   // <80 and ~
static const uint32 kIso2022Active  = …;   // <80 and 1B 0E 0F
static const uint32 kUTF8Active     = …;
static const uint32 kUTF8UTF8Active = …;
static const uint32 kUTF1632Active  = …;   // <80 and 00
static const uint32 kBinaryActive   = …;   // <80 and 00
static const uint32 kTwobyteCode    = …;   // Needs 8xxx
static const uint32 kIsIndicCode    = …;   //
static const uint32 kHighAlphaCode  = …;   // full alphabet in 8x-Fx
static const uint32 kHighAccentCode = …;   // accents in 8x-Fx
static const uint32 kEUCJPActive    = …;   // Have to mess with phase


// Debug only. not thread safe
static int encdet_used = …;
static int rescore_used = …;
static int rescan_used = …;
static int robust_used = …;
static int looking_used = …;
static int doing_used = …;


// For debugging only -- about 256B/entry times about 500 = 128KB
// TODO: only allocate this if being used
DetailEntry;

static int watch1_rankedenc = …;     // Debug. not threadsafe
static int watch2_rankedenc = …;     // Debug. not threadsafe
////static int next_detail_entry = 0;     // Debug. not threadsafe
////static DetailEntry details[kMaxPairs * 10];  // Allow 10 details per bigram
// End For debugging only

// Must match kTestPrintableAsciiTildePlus exit codes, minus one
enum PairSet { … };

// The reasons for pruning
enum PruneReason { … };

static const char* kWhatSetName[] = …;


// State for encodings that do shift-out/shift-in between one- and two-byte
// regions (ISO-2022-xx, HZ)
enum StateSoSi { … };

DetectEncodingState;


// Record a debug event that changes probabilities
void SetDetailsEncProb(DetectEncodingState* destatep,
                       int offset, int best_enc, const char* label) { … }

// Record a debug event that changes probabilities, copy offset
void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
                                 int best_enc, const char* label) { … }

// Record a debug event that changes probs and has simple text label
void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) { … }

// Record a debug event that is just a text label, no change in probs
void SetDetailsLabel(DetectEncodingState* destatep, const char* label) { … }


// Maps superset encodings to base, to see if 2 encodings are compatible
// (Non-identity mappings are marked "-->" below.)
static const Encoding kMapEncToBaseEncoding[] = …;

COMPILE_ASSERT(…);

// Maps base encodings to 0, supersets to 1+, undesired to -1
// (Non-identity mappings are marked "-->" below.)
static const int kMapEncToSuperLevel[] = …;

COMPILE_ASSERT(…);



// Subscripted by Encoding enum value
static const uint32 kSpecialMask[] = …;

COMPILE_ASSERT(…);


/***
  kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents

  ISO_8859_5,       // 4: Teragram ISO-8859-5 Cyrl      UL bd
  RUSSIAN_CP1251,   // 26: Teragram CP1251              UL cdef
  RUSSIAN_KOI8_R,   // 25: Teragram KOI8R               LU cdef
  RUSSIAN_KOI8_RU,  // 28: CP21866 aka KOI8_RU,         LU cdef
  RUSSIAN_CP866,     // 42                              89ae

  ISO_8859_6,       // 5: Teragram Arabic               nocase cde
  MSFT_CP1256,      // 35: used for Arabic              nocase cde

  ISO_8859_7,       // 6: Teragram Greek                UL cdef
  MSFT_CP1253,       // 41: used for Greek              UL cdef

  ISO_8859_8,       // 7: Teragram Hebrew               nocase ef
  MSFT_CP1255,      // 36: Logical Hebrew Microsoft     nocase ef
  ISO_8859_8_I,     // 37: Iso Hebrew Logical           nocase ef
  HEBREW_VISUAL,    // 38: Iso Hebrew Visual            nocase ef

  ISO_8859_11,      // 33: aka TIS-620, used for Thai   nocase abcde
  MSFT_CP874,       // 34: used for Thai                nocase abcde

  TSCII,             // 49                              8-f
  TAMIL_MONO,        // 50
  TAMIL_BI,          // 51
  JAGRAN,            // 52
  BHASKAR,           // 55 Indic encoding - Devanagari
  HTCHANAKYA,        // 56 Indic encoding - Devanagari
***/

// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise
static const char kTestPrintableAsciiTildePlus[256] = …;

// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
static const char kTestPrintableAscii[256] = …;

// Used in first-four-byte testing
static const char kIsPrintableAscii[256] = …;


static const signed char kBase64Value[256] = …;


// Subscripted by <state, byte/16>
// Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
//
// Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
// which we can mis-parse as an error byte followed by good UTF-8:
//                                      B2 DBB8 D6BD E1B9B9
// To counteract this, we now require an ASCII7 byte to resync out
// of the error state
// Next problem: good UTF-8 with bad byte
// efbc a012 eea4 bee7 b280 c2b7
// efbca0 12 eea4be e7b280 c2b7
//        ^^ bad byte
// fix: change state0 byte 1x to be don't-care
//
// Short UTF-8 ending in ASCII7 byte should resync immediately:
// E0 20 E0 A6 AA should give one error and resync at 2nd E0
//
static const char kMiniUTF8State[8][16] = …;
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
static const char kMiniUTF8Count[8][16] = …;

// Subscripted by <state, f(byte1) + g(byte2)>
// where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise
// and g(x) = (x >> 4) & 3        8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
//                                (no checking for illegal bytes)
// Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
// to detect two, so we can back-convert to one.
// zero one    two                 pattern
// ---- ------ ----------------    -----------------
// 81   C281   C382C281            C3->8x->C2->xx
// 98   CB9C   C38BC593            C3->8x->C5->xx
// C3   C383   C383C692            C3->8x->C6->xx
// C8   C388   C383CB86            C3->8x->CB->xx
// 83   C692   C386E28099          C3->8x->E2->xx->8x
// 80   E282AC C3A2E2809AC2AC      C3->A2->E2->xx->xx->Cx->xx
// 92   E28099 C3A2E282ACE284A2    C3->A2->E2->xx->xx->E2->xx->xx
//
// We also want to detect bare-byte extra UTF-8 conversions:
// zero one    two                 pattern
// ---- ------ ----------------    -----------------
// C3   C3     C383                C3->8x->C2->xx
// D3   D3     C393                C3->9x->C2->xx->C2->xx
// E3   E3     C3A3                C3->Ax->C2->xx->C2->xx->C2->xx
// F3   F3     C3B3                C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
//

/**
CP1252 => UTF8 => UTF8UTF8
80 => E282AC => C3A2E2809AC2AC
81 => C281 => C382C281
82 => E2809A => C3A2E282ACC5A1
83 => C692 => C386E28099
84 => E2809E => C3A2E282ACC5BE
85 => E280A6 => C3A2E282ACC2A6
86 => E280A0 => C3A2E282ACC2A0
87 => E280A1 => C3A2E282ACC2A1
88 => CB86 => C38BE280A0
89 => E280B0 => C3A2E282ACC2B0
8A => C5A0 => C385C2A0
8B => E280B9 => C3A2E282ACC2B9
8C => C592 => C385E28099
8D => C28D => C382C28D
8E => C5BD => C385C2BD
8F => C28F => C382C28F
90 => C290 => C382C290
91 => E28098 => C3A2E282ACCB9C
92 => E28099 => C3A2E282ACE284A2
93 => E2809C => C3A2E282ACC593
94 => E2809D => C3A2E282ACC29D
95 => E280A2 => C3A2E282ACC2A2
96 => E28093 => C3A2E282ACE2809C
97 => E28094 => C3A2E282ACE2809D
98 => CB9C => C38BC593
99 => E284A2 => C3A2E2809EC2A2
9A => C5A1 => C385C2A1
9B => E280BA => C3A2E282ACC2BA
9C => C593 => C385E2809C
9D => C29D => C382C29D
9E => C5BE => C385C2BE
9F => C5B8 => C385C2B8
A0 => C2A0 => C382C2A0
A1 => C2A1 => C382C2A1
A2 => C2A2 => C382C2A2
A3 => C2A3 => C382C2A3
A4 => C2A4 => C382C2A4
A5 => C2A5 => C382C2A5
A6 => C2A6 => C382C2A6
A7 => C2A7 => C382C2A7
A8 => C2A8 => C382C2A8
A9 => C2A9 => C382C2A9
AA => C2AA => C382C2AA
AB => C2AB => C382C2AB
AC => C2AC => C382C2AC
AD => C2AD => C382C2AD
AE => C2AE => C382C2AE
AF => C2AF => C382C2AF
B0 => C2B0 => C382C2B0
B1 => C2B1 => C382C2B1
B2 => C2B2 => C382C2B2
B3 => C2B3 => C382C2B3
B4 => C2B4 => C382C2B4
B5 => C2B5 => C382C2B5
B6 => C2B6 => C382C2B6
B7 => C2B7 => C382C2B7
B8 => C2B8 => C382C2B8
B9 => C2B9 => C382C2B9
BA => C2BA => C382C2BA
BB => C2BB => C382C2BB
BC => C2BC => C382C2BC
BD => C2BD => C382C2BD
BE => C2BE => C382C2BE
BF => C2BF => C382C2BF
C0 => C380 => C383E282AC
C1 => C381 => C383C281
C2 => C382 => C383E2809A
C3 => C383 => C383C692
C4 => C384 => C383E2809E
C5 => C385 => C383E280A6
C6 => C386 => C383E280A0
C7 => C387 => C383E280A1
C8 => C388 => C383CB86
C9 => C389 => C383E280B0
CA => C38A => C383C5A0
CB => C38B => C383E280B9
CC => C38C => C383C592
CD => C38D => C383C28D
CE => C38E => C383C5BD
CF => C38F => C383C28F
D0 => C390 => C383C290
D1 => C391 => C383E28098
D2 => C392 => C383E28099
D3 => C393 => C383E2809C
D4 => C394 => C383E2809D
D5 => C395 => C383E280A2
D6 => C396 => C383E28093
D7 => C397 => C383E28094
D8 => C398 => C383CB9C
D9 => C399 => C383E284A2
DA => C39A => C383C5A1
DB => C39B => C383E280BA
DC => C39C => C383C593
DD => C39D => C383C29D
DE => C39E => C383C5BE
DF => C39F => C383C5B8
E0 => C3A0 => C383C2A0
E1 => C3A1 => C383C2A1
E2 => C3A2 => C383C2A2
E3 => C3A3 => C383C2A3
E4 => C3A4 => C383C2A4
E5 => C3A5 => C383C2A5
E6 => C3A6 => C383C2A6
E7 => C3A7 => C383C2A7
E8 => C3A8 => C383C2A8
E9 => C3A9 => C383C2A9
EA => C3AA => C383C2AA
EB => C3AB => C383C2AB
EC => C3AC => C383C2AC
ED => C3AD => C383C2AD
EE => C3AE => C383C2AE
EF => C3AF => C383C2AF
F0 => C3B0 => C383C2B0
F1 => C3B1 => C383C2B1
F2 => C3B2 => C383C2B2
F3 => C3B3 => C383C2B3
F4 => C3B4 => C383C2B4
F5 => C3B5 => C383C2B5
F6 => C3B6 => C383C2B6
F7 => C3B7 => C383C2B7
F8 => C3B8 => C383C2B8
F9 => C3B9 => C383C2B9
FA => C3BA => C383C2BA
FB => C3BB => C383C2BB
FC => C3BC => C383C2BC
FD => C3BD => C383C2BD
FE => C3BE => C383C2BE
FF => C3BF => C383C2BF
**/

// Subscripted by <state, f(byte1) + g(byte2)>
// where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
// and g(x) = (x >> 4) & 3        8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.

// 81   C281   C382C281            C3->8x->C2->xx
// 98   CB9C   C38BC593            C3->8x->C5->xx
// C3   C383   C383C692            C3->8x->C6->xx
// C8   C388   C383CB86            C3->8x->CB->xx
//                                 [0]     [2]   [0]
// 83   C692   C386E28099          C3->8x->E2->xx->xx
//   odd_byte=0                    [0]     [2]       [0+]  odd_byte flipped
//   odd_byte=1                    [0+]    [2] [0]   [0]   odd_byte unflipped
// 80   E282AC C3A2E2809AC2AC      C3->A2->E2->xx->xx->Cx->xx
//   odd_byte=0                    [0]     [3]         [4]   [0+]
//   odd_byte=1                    [0+]    [3] [4]     [4]   [0]
// 92   E28099 C3A2E282ACE284A2    C3->A2->E2->xx->xx->E2->xx->xx
//   odd_byte=0                    [0]     [3]         [4] [0]   [0]
//   odd_byte=1                    [0+]    [3] [4]     [4]       [0+]
//
// When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
// the odd_byte state. If that goes from 0 to 1, the next pair is offset up
// by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
// from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
// These are absorbed with no error in state 0 or state 4
//
// C3   C3     C383                C3->8x->C2->xx
// D3   D3     C393                C3->9x->C2->xx->C2->xx
// E3   E3     C3A3                C3->Ax->C2->xx->C2->xx->C2->xx
// F3   F3     C3B3                C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
// Counter3 for Fx Ex sequences is incremented at last C2

static const char kMiniUTF8UTF8State[8][16] = …;
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
static const char kMiniUTF8UTF8Count[8][16] = …;

static const char kMiniUTF8UTF8Odd[8][16] = …;

// Turn a pair of bytes into the subscript for UTF8UTF8 tables above
int UTF88Sub(char s0, char s1) { … }





// Default probability for an encoding rankedencoding
// Based on a scan of 55M web pages
// These values are 255 - log base 2**1/10 (occurrences / total)
// Large values are most likely. This is the reverse of some Google code
// 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
//
// TODO change this to be per encoding, not permuted
//


// Support function for unit test program
// Return ranked encoding corresponding to enc
// (also exported to compact_enc_det_text.cc)
int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) { … }


string DecodeActive(uint32 active) { … }

static inline bool SevenBitEncoding(int enc) { … }
static inline bool TwoByteEncoding(int enc) { … }
static inline bool IndicEncoding(int enc) { … }
static inline bool HighAlphaEncoding(int enc) { … }
static inline bool HighAccentEncoding(int enc) { … }


static inline bool AnyActive(DetectEncodingState* destatep) { … }
static inline bool SevenBitActive(DetectEncodingState* destatep) { … }
static inline bool HzActive(DetectEncodingState* destatep) { … }
static inline bool Iso2022Active(DetectEncodingState* destatep) { … }
static inline bool UTF8Active(DetectEncodingState* destatep) { … }
static inline bool UTF8UTF8Active(DetectEncodingState* destatep) { … }
static inline bool UTF1632Active(DetectEncodingState* destatep) { … }
static inline bool BinaryActive(DetectEncodingState* destatep) { … }
static inline bool UTF7OrHzActive(DetectEncodingState* destatep) { … }
static inline bool EUCJPActive(DetectEncodingState* destatep) { … }
static inline bool OtherActive(DetectEncodingState* destatep) { … }


static inline bool CEDFlagRescanning(CEDInternalFlags flags) { … }

static inline bool CEDFlagForceTags(CEDInternalFlags flags) { … }


static inline int maxint(int a, int b) { … }
static inline int minint(int a, int b) { … }

static inline const char* MyRankedEncName(int r_enc) { … }


// Only for debugging. not thread safe
static const int kPsSourceWidth = …;
static int pssourcenext = …;    // debug only. not threadsafe. dump only >= this
static int pssourcewidth = …;   // debug only.
static char* pssource_mark_buffer = …;
int next_do_src_line;
int do_src_offset[16];


void PsSourceInit(int len) { … }

void PsSourceFinish() { … }

// Dump aligned len bytes src... if not already dumped
void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) { … }

// Mark bytes in just-previous source bytes
void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) { … }


// Highlight trigram bytes in just-previous source bytes
// Unfortunately, we have to skip back N lines since source was printed for
// up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) { … }


void InitDetectEncodingState(DetectEncodingState* destatep) { … }

// Probability strings are uint8, with zeros removed via simple run-length:
//  (<skip-take byte> <data bytes>)*
// skip-take:
//  00  end
//  x0  skip 16 x locations, take 0 data values
//  xy  skip x locations, take y data values
// Multiply all the incoming values by 3 to account for 3x unigram sums
//
// {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
//   0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
//
// Weight is 0..100 percent
//
// Returns subscript of largest (most probable) value
//


//  {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
//        // ASCII-7-bit=178  Latin1=174  UTF8=160  GB=50  CP1252=161  BIG5=49  Latin2=66  CP1251=57  CP1256=59  CP1250=51  Latin5=69  ISO-8859-15=111  [top ASCII-7-bit]
int ApplyCompressedProb(const char* iprob, int len,
                         int weight, DetectEncodingState* destatep) { … }


// Returns subscript of largest (most probable) value [for unit test]
int TopCompressedProb(const char* iprob, int len) { … }


// Find subscript of matching key in first 8 bytes of sorted hint array, or -1
int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
                     const char* norm_key) { … }

// Find subscript of matching key in first 4 bytes of sorted hint array, or -1
int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
                     const char* norm_key) { … }

static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) { … }

static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) { … }

// Apply initial probability hint based on top level domain name
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyTldHint(const char* url_tld_hint, int weight,
                  DetectEncodingState* destatep) { … }

// Apply initial probability hint based on charset= name
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyCharsetHint(const char* charset_hint, int weight,
                      DetectEncodingState* destatep) { … }

// Apply initial probability hint based on caller-supplied encoding
// A non-negative encoding_hint boosts that encoding; a negative one is read
// as ~encoding (bitwise NOT) and whacks that encoding instead.
//
// Negative hints are an experiment to see if they might be useful.
// Bitwise NOT is used instead of unary minus so that "not encoding 0" can be
// specified: -0 is still 0, whereas ~0 is -1.
// Weight is presumably 0..100 percent, as documented for the other
// Apply*Hint functions in this file -- confirm against the body.
int ApplyEncodingHint(const int encoding_hint, int weight,
                       DetectEncodingState* destatep) { … }

// Apply initial probability hint based on user interface language
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyUILanguageHint(const Language language_hint,
                        int weight, DetectEncodingState* destatep) { … }

// Apply initial probability hint based on corpus type (web, email, etc)
// Return 1 if name match found
int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
                      DetectEncodingState* destatep) { … }



// Do reverse search for byte c in [str..str+len)
// Note: str points at the FRONT of the range, not at its last byte
// Presumably returns a pointer to the match, or NULL when c does not occur,
// in the manner of memrchr -- the body is not shown here; confirm.
const char* MyMemrchr(const char* str, char c, size_t len) { … }


// Minimum real URL is 11 bytes: "http://a.bc" -- anything shorter is assumed
// to be a bare TLD.
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to a simple TLD. This test remains
// for backwards compatibility with any caller still passing a full URL.
static const int kMinURLLength = …;

// Extract TLD from url_hint, which may be a full URL or just a TLD.
// The TLD goes to tld_hint; tld_hint_len is presumably the size of that
// buffer -- confirm against the body.
// If url_hint is a full URL, the hostname and its length are also returned
// through ret_host_start and ret_host_len.
void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
                const char** ret_host_start, int* ret_host_len) { … }

// Apply hints, if any, to probabilities
// The hints available are the ones in the parameter list: URL, HTTP charset,
// META charset, explicit encoding, UI language, and corpus type.
// NOTE: Encoding probabilities are all zero at this point
void ApplyHints(const char* url_hint,
                const char* http_charset_hint,
                const char* meta_charset_hint,
                const int encoding_hint,
                const Language language_hint,
                const CompactEncDet::TextCorpusType corpus_type,
                DetectEncodingState* destatep) { … }

// Look for specific high-value patterns in the first 4 bytes
// Byte order marks (BOM)
//  EFBBBF    UTF-8
//  FEFF      UTF-16 BE
//  FFFE      UTF-16 LE
//  FFFE0000  UTF-32 LE
//  0000FEFF  UTF-32 BE
//
// Likely UTF-x of seven-bit ASCII
//  00xx      UTF-16 BE  xx printable ASCII
//  xx00      UTF-16 LE
//  000000xx  UTF-32 BE
//  xx000000  UTF-32 LE
//
void InitialBytesBoost(const uint8* src,
                       int text_length,
                       DetectEncodingState* destatep) { … }



// Descending order
int IntCompare(const void* v1, const void* v2) { … }

bool Base64Char(uint8 c) { … }

int Base64ScanLen(const uint8* start, const uint8* limit) { … }

// Input is at least 8-character legal base64 string after +.
// But might be say + "Presse+Termine"
bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) { … }

// Prune here after N bytes
// Boost here for seven-bit sequences (at every prune)
// if (sevenbitrankedencoding)
//   + UTF7   scan and boost/demote len mod 8 = 0 3 6
//   ~ Hz     scan and boost/demote len mod 8 = 0 2 4 6
//   1B 2022  scan and boost/demote len mod 8 = 0 2 4 6
//   0E 2022  scan and boost/demote len mod 8 = 0 2 4 6
//   [0F 2022  boost/demote]
//   00 UTF16/32  scan and boost/demote offset = even/odd
//
// If still some seven-bit possibilities > pure ASCII,
// scan each possibility for clearer prob, s.t. about
// two good sequences is a clear win
// A-Z 00-19 00xx-64xx   (B = 04xx)
// a-z 1A-33 68xx-CCxx   (f = 7Cxx)
// 0-9 34-3D D0xx-F4xx   (1 = D4xx)
// +   3E    F8xx
// /   3F    FCxx
// do another chunk  with slow scan


// Boost, whack, or leave alone UTF-7 probability
void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) { … }

// Boost, whack, or leave alone HZ probability
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) { … }

// Boost, whack, or leave alone BINARY probability
void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … }


// Demote UTF-16/32 on 0000 or FFFF, favoring Binary
void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) { … }

// Make even offset
void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) { … }

bool ConsecutivePair(DetectEncodingState* destatep, int i) { … }

// Boost, whack, or leave alone UTF-8 probability
// Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
// Returns total boost
int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) { … }


// Boost, whack, or leave alone UTF8UTF8 probability
//
// We are looking for
// (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the
//     MS CP1252 mappings, and
// (2) sequences of 2 or more such characters
//
// If so, we could be looking at some non-7-bit encoding extra-converted
// to UTF-8. The most common observed is CP1252->UTF8 twice,
//    1252=>UTF8 : 1252=>UTF8
// where the colon means "take those bytes and pretend that they are 1252".
// We have a couple of examples of BIG5 bytes converted as though
// they were 1252,
//    BIG5 : 1252=>UTF8
//
// Of course, we don't want correctly converted 1252 to be flagged here
//    1252=>UTF8
// So we want the input high bytes to be in pairs or longer, hence the
// output UTF8 in groups of four bytes or more
//
// Good chars: C2xx, C3xx,
// Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C
// Good chars: E280xx E282AC E284A2
//             C2xx 1100001x 10xxxxxx   (128/128)
//             C5xx 11000101 10xx00xx   (16/4)
//             C5xx 11000101 10111xxx   (8/3)
//             C692 11000110 10010010   (1/1)
//             CBxx 11001011 100xx1x0   (8/2)
//             E28x 11100010 10000xx0   (4/3)
//
// Returns total boost
int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) { … }


// We give a gentle boost for each paired SO ... SI, whack others
void CheckIso2022ActiveSeq(DetectEncodingState* destatep) { … }

// We give a gentle boost for each paired ~{ ... ~}, whack others
void CheckHzActiveSeq(DetectEncodingState* destatep) { … }

// We give a gentle boost after an odd number of 8Fxxxx triples, which
// put subsequent bigrams out of phase until a low byte or another 8Fxxxx
void CheckEucJpSeq(DetectEncodingState* destatep) { … }

// Boost, whack, or leave alone BINARY probability
// Also called if UTF 16/32 active
void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,
                        int delta_otherpairs) { … }


// Look at a number of special-case encodings whose reliable detection depends
// on sequencing or other properties
// AsciiPair probabilities (UTF7 and HZ) are all done here
void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) { … }


void PrintTopEnc(DetectEncodingState* destatep, int n) { … }

// If the same bigram repeats, don't boost its best encoding too much
bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … }

// Sometimes illegal bytes are used as markers between text that Javascript
// is going to decode. Don't overboost the Binary encoding for markers 01-FF.
// Just count first pair per 8x4 bucket
bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … }




// Find current top two rankedencoding probabilities
void ReRank(DetectEncodingState* destatep) { … }

void SimplePrune(DetectEncodingState* destatep, int prune_diff) { … }

// Recalculate reliable
void CalcReliable(DetectEncodingState* destatep) { … }


// Find current top two rankedencoding probabilities
void FindTop2(DetectEncodingState* destatep,
              int* first_renc, int* second_renc,
              int* first_prob, int* second_prob) { … }


void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) { … }




// Map unencoded bytes down to five bits, largely preserving letters
// This design struggles to put 33 values into 5 bits.
#define XX …
#define HA …
#define HE …
#define HI …
#define HO …
#define HU …
#define Hc …
static const char kMapToFiveBits[256] = …;
#undef XX
#undef HA
#undef HE
#undef HI
#undef HO
#undef HU
#undef Hc

static const int kTriLatin1Likely = …;
static const int kTriLatin2Likely = …;
static const int kTriLatin7Likely = …;

// Each table entry has 32 times two bits, selected by byte[2]
// Entry subscript is selected by byte[0] and byte[1]
// Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc
static const uint64 kLatin127Trigrams[1024] = …;
// Latin1 6%, Latin2 11%, Latin7 3%



// Just for debugging. not thread-safe
static char tri_string[4];
char* Latin127Str(int trisub) { … }

// Returns two bits per three-byte trigram, indicating
// dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely
int TrigramValue(const uint8* trisrc) { … }


// Put out trigrams for surrounding 32 bytes for Latin encodings
// Return true if more Latin2 & 7 than Latin1
bool BoostLatin127Trigrams(int tri_block_offset,
                           DetectEncodingState* destatep) { … }



// Boost any encodings that need extra detection help, then prune
// src is first unscanned byte
// slowend means extra pruning when dropping out of initial slow scan
// final means last call -- no bigram at src
void BoostPrune(const uint8* src, DetectEncodingState* destatep,
                int prunereason) { … }


// Accumulate aligned byte-pair at src
// Occasionally, calc boost for some encodings and then prune the active list
// weightshift is used to give low weight some text, such as inside tags
// Returns true if pruning occurred
bool IncrementAndBoostPrune(const uint8* src,
                            int remaining_length,
                            DetectEncodingState* destatep,
                            int weightshift,
                            int exit_reason) { … }

void DumpSummary(DetectEncodingState* destatep, int whatset, int n) { … }

void BeginDetail(DetectEncodingState* destatep) { … }

// Single character to represent (printable ASCII) gap between bigrams
char DetailOffsetChar(int delta) { … }

void DumpDetail(DetectEncodingState* destatep) { … }

void PsRecurse(const char* buff) { … }

void DumpReliable(DetectEncodingState* destatep) { … }

// Scan short single lines quickly for all printable ASCII
// Return true if all bytes are in [20..7F], false otherwise
bool QuickPrintableAsciiScan(const char* text, int text_length) { … }

static const int kMaxScanBack = …;

// Return true if text is inside a tag or JS comment
bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) { … }

const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) { … }


// Take a watch string and map to a ranked encoding. If no match, return -1
int LookupWatchEnc(const string& watch_str) { … }

// Return true if enc and enc2 are equal or one is a subset of the other
// or either is UNKNOWN
// also UTF8UTF8 is compatible with both Latin1 and UTF8
bool CompatibleEnc(Encoding enc, Encoding enc2) { … }

// Return superset of enc and enc2, which must be compatible
Encoding SupersetEnc(Encoding enc, Encoding enc2) { … }


// If unreliable, try rescoring to separate some encodings
Encoding Rescore(Encoding enc, const uint8* isrc,
                 const uint8* srctextlimit, DetectEncodingState* destatep) { … }


// Given an encoding, add its corresponding ranked encoding to the set
void AddToSet(Encoding enc, int* list_len, int* list) { … }


static const int kMinRobustBigramCount = …;
static const int kMinKBToRobustScan = …;
static const int kMaxKBToRobustScan = …;

// Scan the first 64K or so, just doing raw bigram increments on given
// probability list.
// No fancy duplicate filtering or anything else here.
// Returns number of bigrams counted
int RobustScan(const char* text,
                int text_length,
                int robust_renc_list_len,
                int* robust_renc_list,
                int* robust_renc_probs) { … }

// If unreliable, rescan middle of document to see if we can get a better
// answer. Rescan is only worthwhile if there are ~200 bytes or more left,
// since the detector takes as much as 96 bytes of bigrams to decide.
Encoding Rescan(Encoding enc,
                const uint8* isrc,
                const uint8* src,
                const uint8* srctextlimit,
                const char* url_hint,
                const char* http_charset_hint,
                const char* meta_charset_hint,
                const int encoding_hint,
                const Language language_hint,
                const CompactEncDet::TextCorpusType corpus_type,
                bool ignore_7bit_mail_encodings,
                DetectEncodingState* destatep) { … }

// With no hints at all, and perhaps on rescan, we relax our pickiness
// and go ahead and accept the top multibyte encodings, even though
// strictly their web pages should have declared an explicit encoding to
// avoid the HTML standard's default ISO-8859-1.
bool NoHintsCloseEnoughCompatible(Encoding top_enc) { … }



// Scan raw bytes and detect most likely encoding
// Design goals:
//   Skip over big initial stretches of seven-bit ASCII bytes very quickly
//   Thread safe
//   Works equally well on
//    50-byte queries,
//    5000-byte email and
//    50000-byte web pages
// Length 0 input returns ISO_8859_1 (ASCII) encoding
// Setting ignore_7bit_mail_encodings effectively turns off detection of
//  UTF-7, HZ, and ISO-2022-xx
Encoding InternalDetectEncoding(
    CEDInternalFlags flags, const char* text, int text_length,
    const char* url_hint, const char* http_charset_hint,
    const char* meta_charset_hint, const int encoding_hint,
    const Language language_hint,  // User interface lang
    const CompactEncDet::TextCorpusType corpus_type,
    bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
    Encoding* second_best_enc) { … }

Encoding CompactEncDet::DetectEncoding(
    const char* text, int text_length, const char* url_hint,
    const char* http_charset_hint, const char* meta_charset_hint,
    const int encoding_hint,
    const Language language_hint,  // User interface lang
    const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
    int* bytes_consumed, bool* is_reliable) { … }


// Return top encoding hint for given string
Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) { … }

// Return top encoding hint for given string
Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) { … }

// Return top encoding hint for given string
Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) { … }

const char* CompactEncDet::Version(void) { … }
chromium/third_party/ced/src/compact_enc_det/compact_enc_det.cc