// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // //////////////////////////////////////////////////////////////////////////////// #include "compact_enc_det/compact_enc_det.h" #include <math.h> // for sqrt #include <stddef.h> // for size_t #include <stdio.h> // for printf, fprintf, NULL, etc #include <stdlib.h> // for qsort #include <string.h> // for memset, memcpy, memcmp, etc #include <memory> #include <string> // for string, operator==, etc #include "compact_enc_det/compact_enc_det_hint_code.h" #include "util/string_util.h" #include "util/basictypes.h" #include "util/commandlineflags.h" #include "util/logging.h" string; // TODO as of 2007.10.09: // // Consider font=TT-BHxxx as user-defined => binary // Demote GB18030 if no 8x3x pair // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires // Consider removing/ignoring bytes 01-1F to avoid crap pollution // Possibly boost declared encoding in robust scan // googlebot tiny files // look for ranges of encodings // consider tags just as > < within aligned block of 32 // flag too few characters in postproc (Latin 6 problem) // Remove slow scan beyond 16KB // Consider removing kMostLikelyEncoding or cut it in half // A note on mixed encodings // // The most common encoding error on the web is a page containing a mixture of // CP-1252 and UTF-8. 
A less common encoding error is a third-party feed that // has been converted from CP-1252 to UTF-8 and then those bytes converted a // second time to UTF-8. CED originally attempted to detect these error cases // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended // implementation was to start these just below CP1252 and UTF8 respectively in // overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are // found. // // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the // UTF8CP1252 internal encoding was added late and not put into encodings.proto, // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and // is removed in this November 2011 CL. // // Mixed encoding detection never worked out as well as envisioned, so the // ced_allow_utf8utf8 flag normally disables all this. // // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as // UTF8, and the inputconverter code for UTF8 normally will convert bare // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8 // and double-UTF-8 mixtures will be detected as UTF-8, and the double // conversion will stand. // // However, it is occasionally useful to use CED to detect double-converted // UTF-8 coming from third-party data feeds, so they can be fixed at the source. // For this purpose, the UTF8UTF8 encoding remains available under the // ced_allow_utf8utf8 flag. // // When UTF8UTF8 is detected, the inputconverter code will undo the double // conversion, giving good text.
// Norbert Runge has noted these words in CP1252 that are mistakenly identified // as UTF-8 because of the last pair of characters: // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA // Schoß\u201c 0xDF 0x93 U+00DF U+201C // weiß\u201c 0xDF 0x93 U+00DF U+201C // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE // These four byte combinations now explicitly boost Latin1/CP1252. // And for reference, here are a couple of Portuguese spellings // that may be mistaken as double-byte encodings. // informações 0xE7 0xF5 // traição 0xE7 0xE3 static const char* kVersion = …; DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, " "to handle mixtures of CP1252 " "converted to UTF-8 zero, one, " "or two times"); DEFINE_int32(enc_detect_slow_max_kb, 16, "Maximum number of Kbytes to examine for " "7-bit-only (2022, Hz, UTF7) encoding detect. " "You are unlikely to want to change this."); DEFINE_int32(enc_detect_fast_max_kb, 256, "Maximum number of Kbytes to examine for encoding detect. 
" "You are unlikely to want to change this."); DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility " "difference 1st - 2nd to be considered reliable \n" " 2 corresponds to min 4x difference\n" " 4 corresponds to min 16x difference\n" " 8 corresponds to min 256x difference\n" " 10 corresponds to min 1024x difference\n" " 20 corresponds to min 1Mx difference."); // Text debug output options DEFINE_bool(enc_detect_summary, false, "Print first 16 interesting pairs at exit."); DEFINE_bool(counts, false, "Count major-section usage"); // PostScript debug output options DEFINE_bool(enc_detect_detail, false, "Print PostScript of every update, to stderr."); DEFINE_bool(enc_detect_detail2, false, "More PostScript detail of every update, to stderr."); DEFINE_bool(enc_detect_source, false, "Include source text in detail"); // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in // lang_enc.cc // Following flags are not in use. Replace them with constants to // avoid static initialization. //DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name."); //DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name."); static const char* const FLAGS_enc_detect_watch1 = …; static const char* const FLAGS_enc_detect_watch2 = …; // Only for experiments. Delete soon.
DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams"); // Demo-mode/debugging experiment DEFINE_bool(demo_nodefault, false, "Default to all equal; no boost for declared encoding."); DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings"); DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr"); static const int XDECILOG2 = …; // Multiplier for log base 2 ** n/10 static const int XLOG2 = …; // Multiplier for log base 2 ** n static const int kFinalPruneDifference = …; // Final bits of minimum // probability difference 1st-nth // to be pruned static const int kInititalPruneDifference = …; // Initial bits of minimum // probability difference 1st-nth // to be pruned // static const int kPruneDiffDecrement = …; // Decrements bits of minimum // probability difference 1st-nth // to be pruned static const int kSmallInitDiff = …; // bits of minimum // probability difference, base to // superset encodings static const int kBoostInitial = …; // bits of boost for // initial byte patterns (BOM, 00) static const int kBadPairWhack = …; // bits of whack for // one bad pair static const int kBoostOnePair = …; // bits of boost for // one good pair in Hz, etc. 
static const int kGentleOnePair = …; // bits of boost for // one good sequence // static const int kGentlePairWhack = …; // bits of whack // for ill-formed sequence static const int kGentlePairBoost = …; // bits of boost // for well-formed sequence static const int kDeclaredEncBoost = …; // bits/10 of boost for // best declared encoding per bigram static const int kBestEncBoost = …; // bits/10 of boost for // best encoding per bigram static const int kTrigramBoost = …; // bits of boost for Latin127 tri static const int kMaxPairs = …; // Max interesting pairs to look at // If you change this, // adjust *PruneDiff* static const int kPruneMask = …; // Prune every 8 interesting pairs static const int kBestPairsCount = …; // For first N pairs, do extra boost // based on most likely encoding // of pair over entire web static const int kDerateHintsBelow = …; // If we have fewer than N bigrams, // weaken the hints enough that // unhinted encodings have a hope of // rising to the top static const int kMinRescanLength = …; // Don't bother rescanning for // unreliable encoding if fewer // than this many bytes unscanned. // We will rescan at most last half // of this. static const int kStrongBinary = …; // Make F_BINARY the only encoding static const int kWeakerBinary = …; // Make F_BINARY likely encoding // These are byte counts from front of file static const int kBinaryHardAsciiLimit = …; // Not binary if all ASCII static const int kBinarySoftAsciiLimit = …; // " if mostly ASCII // We try here to avoid having title text dominate the encoding detection, // for the not-infrequent error case of title in encoding1, body in encoding2: // we want to bias toward encoding2 winning. // // kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we // rarely cut off mid-character in the original (not-yet-detected) encoding. // This matters most for UTF-8 two- and three-byte codes and for // Shift-JIS three-byte codes. 
static const int kMaxBigramsTagTitleText = …; // Keep only some tag text static const int kWeightshiftForTagTitleText = …; // Give text in tags, etc. // 1/16 normal weight static const int kStrongPairs = …; // Let reliable enc with this many // pairs overcome missing hint enum CEDInternalFlags { … }; // Forward declaration Encoding InternalDetectEncoding( CEDInternalFlags flags, const char* text, int text_length, const char* url_hint, const char* http_charset_hint, const char* meta_charset_hint, const int encoding_hint, const Language language_hint, // User interface lang const CompactEncDet::TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable, Encoding* second_best_enc); UnigramEntry; //typedef struct { // uint8 b12[256*256]; // Bigram probability for aligned bigram //} FullBigramEntry; // Include all the postproc-generated tables here: // RankedEncoding // kMapToEncoding // unigram_table // kMostLikelyEncoding // kTLDHintProbs // kCharsetHintProbs // HintEntry, kMaxTldKey kMaxTldVector, etc.
// ============================================================================= #include "compact_enc_det/compact_enc_det_generated_tables.h" #define F_ASCII … #define F_BINARY … #define F_UTF8UTF8 … #define F_BIG5_CP950 … #define F_Unicode … // ============================================================================= // 7-bit encodings have at least one "interesting" byte value < 0x80 // (00 0E 1B + ~) // JIS 2022-cn 2022-kr hz utf7 // Unicode UTF-16 UTF-32 // 8-bit encodings have no interesting byte values < 0x80 static const uint32 kSevenBitActive = …; // needs <80 to detect static const uint32 kUTF7Active = …; // <80 and + static const uint32 kHzActive = …; // <80 and ~ static const uint32 kIso2022Active = …; // <80 and 1B 0E 0F static const uint32 kUTF8Active = …; static const uint32 kUTF8UTF8Active = …; static const uint32 kUTF1632Active = …; // <80 and 00 static const uint32 kBinaryActive = …; // <80 and 00 static const uint32 kTwobyteCode = …; // Needs 8xxx static const uint32 kIsIndicCode = …; // static const uint32 kHighAlphaCode = …; // full alphabet in 8x-Fx static const uint32 kHighAccentCode = …; // accents in 8x-Fx static const uint32 kEUCJPActive = …; // Have to mess with phase // Debug only. not thread safe static int encdet_used = …; static int rescore_used = …; static int rescan_used = …; static int robust_used = …; static int looking_used = …; static int doing_used = …; // For debugging only -- about 256B/entry times about 500 = 128KB // TODO: only allocate this if being used DetailEntry; static int watch1_rankedenc = …; // Debug. not threadsafe static int watch2_rankedenc = …; // Debug. not threadsafe ////static int next_detail_entry = 0; // Debug. 
not threadsafe ////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram // End For debugging only // Must match kTestPrintableAsciiTildePlus exit codes, minus one enum PairSet { … }; // The reasons for pruning enum PruneReason { … }; static const char* kWhatSetName[] = …; // State for encodings that do shift-out/shift-in between one- and two-byte // regions (ISO-2022-xx, HZ) enum StateSoSi { … }; DetectEncodingState; // Record a debug event that changes probabilities void SetDetailsEncProb(DetectEncodingState* destatep, int offset, int best_enc, const char* label) { … } // Record a debug event that changes probabilities, copy offset void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep, int best_enc, const char* label) { … } // Record a debug event that changes probs and has simple text label void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) { … } // Record a debug event that is just a text label, no change in probs void SetDetailsLabel(DetectEncodingState* destatep, const char* label) { … } // Maps superset encodings to base, to see if 2 encodings are compatible // (Non-identity mappings are marked "-->" below.) static const Encoding kMapEncToBaseEncoding[] = …; COMPILE_ASSERT(…); // Maps base encodings to 0, supersets to 1+, undesired to -1 // (Non-identity mappings are marked "-->" below.) 
static const int kMapEncToSuperLevel[] = …; COMPILE_ASSERT(…); // Subscripted by Encoding enum value static const uint32 kSpecialMask[] = …; COMPILE_ASSERT(…); /*** kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef RUSSIAN_CP866, // 42 89ae ISO_8859_6, // 5: Teragram Arabic nocase cde MSFT_CP1256, // 35: used for Arabic nocase cde ISO_8859_7, // 6: Teragram Greek UL cdef MSFT_CP1253, // 41: used for Greek UL cdef ISO_8859_8, // 7: Teragram Hebrew nocase ef MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde MSFT_CP874, // 34: used for Thai nocase abcde TSCII, // 49 8-f TAMIL_MONO, // 50 TAMIL_BI, // 51 JAGRAN, // 52 BHASKAR, // 55 Indic encoding - Devanagari HTCHANAKYA, // 56 Indic encoding - Devanagari ***/ // We can scan bytes using this at about 500 MB/sec 2.8GHz P4 // Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~ // We allow FF, 0x0C, here because it gives a better result for old // Ascii text formatted for a TTY // non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise static const char kTestPrintableAsciiTildePlus[256] = …; // We can scan bytes using this at about 550 MB/sec 2.8GHz P4 // Slow scan uses this, stopping on NUL ESC SO SI and bad C0 // after Hz and UTF7 are pruned away // We allow Form Feed, 0x0C, here static const char kTestPrintableAscii[256] = …; // Used in first-four-byte testing static const char kIsPrintableAscii[256] = …; static const signed char kBase64Value[256] = …; // Subscripted by <state, byte/16> // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x // // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9 // which we can mis-parse as an 
error byte followed by good UTF-8: // B2 DBB8 D6BD E1B9B9 // To counteract this, we now require an ASCII7 byte to resync out // of the error state // Next problem: good UTF-8 with bad byte // efbc a012 eea4 bee7 b280 c2b7 // efbca0 12 eea4be e7b280 c2b7 // ^^ bad byte // fix: change state0 byte 1x to be don't-care // // Short UTF-8 ending in ASCII7 byte should resync immediately: // E0 20 E0 A6 AA should give one error and resync at 2nd E0 // static const char kMiniUTF8State[8][16] = …; // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B static const char kMiniUTF8Count[8][16] = …; // Subscripted by <state, f(byte1) + g(byte2)> // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. // (no checking for illegal bytes) // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want // to detect two, so we can back-convert to one. // zero one two pattern // ---- ------ ---------------- ----------------- // 81 C281 C382C281 C3->8x->C2->xx // 98 CB9C C38BC593 C3->8x->C5->xx // C3 C383 C383C692 C3->8x->C6->xx // C8 C388 C383CB86 C3->8x->CB->xx // 83 C692 C386E28099 C3->8x->E2->xx->8x // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx // // We also want to detect bare-byte extra UTF-8 conversions: // zero one two pattern // ---- ------ ---------------- ----------------- // C3 C3 C383 C3->8x->C2->xx // D3 D3 C393 C3->9x->C2->xx->C2->xx // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx // F3 F3 C3B2 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx // /** CP1252 => UTF8 => UTF8UTF8 80 => E282AC => C3A2E2809AC2AC 81 => C281 => C382C281 82 => E2809A => C3A2E282ACC5A1 83 => C692 => C386E28099 84 => E2809E => C3A2E282ACC5BE 85 => E280A6 => C3A2E282ACC2A6 86 => E280A0 => C3A2E282ACC2A0 87 => E280A1 => C3A2E282ACC2A1 88 => CB86 => C38BE280A0 89 => E280B0 => C3A2E282ACC2B0 8A => C5A0 => C385C2A0 8B => E280B9 => C3A2E282ACC2B9 8C => 
C592 => C385E28099 8D => C28D => C382C28D 8E => C5BD => C385C2BD 8F => C28F => C382C28F 90 => C290 => C382C290 91 => E28098 => C3A2E282ACCB9C 92 => E28099 => C3A2E282ACE284A2 93 => E2809C => C3A2E282ACC593 94 => E2809D => C3A2E282ACC29D 95 => E280A2 => C3A2E282ACC2A2 96 => E28093 => C3A2E282ACE2809C 97 => E28094 => C3A2E282ACE2809D 98 => CB9C => C38BC593 99 => E284A2 => C3A2E2809EC2A2 9A => C5A1 => C385C2A1 9B => E280BA => C3A2E282ACC2BA 9C => C593 => C385E2809C 9D => C29D => C382C29D 9E => C5BE => C385C2BE 9F => C5B8 => C385C2B8 A0 => C2A0 => C382C2A0 A1 => C2A1 => C382C2A1 A2 => C2A2 => C382C2A2 A3 => C2A3 => C382C2A3 A4 => C2A4 => C382C2A4 A5 => C2A5 => C382C2A5 A6 => C2A6 => C382C2A6 A7 => C2A7 => C382C2A7 A8 => C2A8 => C382C2A8 A9 => C2A9 => C382C2A9 AA => C2AA => C382C2AA AB => C2AB => C382C2AB AC => C2AC => C382C2AC AD => C2AD => C382C2AD AE => C2AE => C382C2AE AF => C2AF => C382C2AF B0 => C2B0 => C382C2B0 B1 => C2B1 => C382C2B1 B2 => C2B2 => C382C2B2 B3 => C2B3 => C382C2B3 B4 => C2B4 => C382C2B4 B5 => C2B5 => C382C2B5 B6 => C2B6 => C382C2B6 B7 => C2B7 => C382C2B7 B8 => C2B8 => C382C2B8 B9 => C2B9 => C382C2B9 BA => C2BA => C382C2BA BB => C2BB => C382C2BB BC => C2BC => C382C2BC BD => C2BD => C382C2BD BE => C2BE => C382C2BE BF => C2BF => C382C2BF C0 => C380 => C383E282AC C1 => C381 => C383C281 C2 => C382 => C383E2809A C3 => C383 => C383C692 C4 => C384 => C383E2809E C5 => C385 => C383E280A6 C6 => C386 => C383E280A0 C7 => C387 => C383E280A1 C8 => C388 => C383CB86 C9 => C389 => C383E280B0 CA => C38A => C383C5A0 CB => C38B => C383E280B9 CC => C38C => C383C592 CD => C38D => C383C28D CE => C38E => C383C5BD CF => C38F => C383C28F D0 => C390 => C383C290 D1 => C391 => C383E28098 D2 => C392 => C383E28099 D3 => C393 => C383E2809C D4 => C394 => C383E2809D D5 => C395 => C383E280A2 D6 => C396 => C383E28093 D7 => C397 => C383E28094 D8 => C398 => C383CB9C D9 => C399 => C383E284A2 DA => C39A => C383C5A1 DB => C39B => C383E280BA DC => C39C => C383C593 DD => C39D => C383C29D DE 
=> C39E => C383C5BE DF => C39F => C383C5B8 E0 => C3A0 => C383C2A0 E1 => C3A1 => C383C2A1 E2 => C3A2 => C383C2A2 E3 => C3A3 => C383C2A3 E4 => C3A4 => C383C2A4 E5 => C3A5 => C383C2A5 E6 => C3A6 => C383C2A6 E7 => C3A7 => C383C2A7 E8 => C3A8 => C383C2A8 E9 => C3A9 => C383C2A9 EA => C3AA => C383C2AA EB => C3AB => C383C2AB EC => C3AC => C383C2AC ED => C3AD => C383C2AD EE => C3AE => C383C2AE EF => C3AF => C383C2AF F0 => C3B0 => C383C2B0 F1 => C3B1 => C383C2B1 F2 => C3B2 => C383C2B2 F3 => C3B3 => C383C2B3 F4 => C3B4 => C383C2B4 F5 => C3B5 => C383C2B5 F6 => C3B6 => C383C2B6 F7 => C3B7 => C383C2B7 F8 => C3B8 => C383C2B8 F9 => C3B9 => C383C2B9 FA => C3BA => C383C2BA FB => C3BB => C383C2BB FC => C3BC => C383C2BC FD => C3BD => C383C2BD FE => C3BE => C383C2BE FF => C3BF => C383C2BF **/ // Subscripted by <state, f(byte1) + g(byte2)> // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. // 81 C281 C382C281 C3->8x->C2->xx // 98 CB9C C38BC593 C3->8x->C5->xx // C3 C383 C383C692 C3->8x->C6->xx // C8 C388 C383CB86 C3->8x->CB->xx // [0] [2] [0] // 83 C692 C386E28099 C3->8x->E2->xx->xx // odd_byte=0 [0] [2] [0+] odd_byte flipped // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx // odd_byte=0 [0] [3] [4] [0+] // odd_byte=1 [0+] [3] [4] [4] [0] // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx // odd_byte=0 [0] [3] [4] [0] [0] // odd_byte=1 [0+] [3] [4] [4] [0+] // // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip // the odd_byte state. If that goes from 0 to 1, the next pair is offset up // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx. 
// These are absorbed with no error in state 0 or state 4 // // C3 C3 C383 C3->8x->C2->xx // D3 D3 C393 C3->9x->C2->xx->C2->xx // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx // F3 F3 C3B2 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx // Counter3 for Fx Ex sequences is incremented at last C2 static const char kMiniUTF8UTF8State[8][16] = …; // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B static const char kMiniUTF8UTF8Count[8][16] = …; static const char kMiniUTF8UTF8Odd[8][16] = …; // Turn a pair of bytes into the subscript for UTF8UTF8 tables above int UTF88Sub(char s0, char s1) { … } // Default probability for an encoding rankedencoding // Based on a scan of 55M web pages // These values are 255 - log base 2**1/10 (occurrences / total) // Large values are most likely. This the reverse of some Google code // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M) // // TODO change this to be per encoding, not permuted // // Support function for unit test program // Return ranked encoding corresponding to enc // (also exported to compact_enc_det_text.cc) int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) { … } string DecodeActive(uint32 active) { … } static inline bool SevenBitEncoding(int enc) { … } static inline bool TwoByteEncoding(int enc) { … } static inline bool IndicEncoding(int enc) { … } static inline bool HighAlphaEncoding(int enc) { … } static inline bool HighAccentEncoding(int enc) { … } static inline bool AnyActive(DetectEncodingState* destatep) { … } static inline bool SevenBitActive(DetectEncodingState* destatep) { … } static inline bool HzActive(DetectEncodingState* destatep) { … } static inline bool Iso2022Active(DetectEncodingState* destatep) { … } static inline bool UTF8Active(DetectEncodingState* destatep) { … } static inline bool UTF8UTF8Active(DetectEncodingState* destatep) { … } static inline bool UTF1632Active(DetectEncodingState* destatep) { … } static inline bool BinaryActive(DetectEncodingState* 
destatep) { … } static inline bool UTF7OrHzActive(DetectEncodingState* destatep) { … } static inline bool EUCJPActive(DetectEncodingState* destatep) { … } static inline bool OtherActive(DetectEncodingState* destatep) { … } static inline bool CEDFlagRescanning(CEDInternalFlags flags) { … } static inline bool CEDFlagForceTags(CEDInternalFlags flags) { … } static inline int maxint(int a, int b) { … } static inline int minint(int a, int b) { … } static inline const char* MyRankedEncName(int r_enc) { … } // Only for debugging. not thread safe static const int kPsSourceWidth = …; static int pssourcenext = …; // debug only. not threadsafe. dump only >= this static int pssourcewidth = …; // debug only. static char* pssource_mark_buffer = …; int next_do_src_line; int do_src_offset[16]; void PsSourceInit(int len) { … } void PsSourceFinish() { … } // Dump aligned len bytes src... if not already dumped void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) { … } // Mark bytes in just-previous source bytes void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) { … } // Highlight trigram bytes in just-previous source bytes // Unfortunately, we have to skip back N lines since source was printed for // up to 8 bigrams before we get here. 
Match on src+1 to handle 0/31 better void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) { … } void InitDetectEncodingState(DetectEncodingState* destatep) { … } // Probability strings are uint8, with zeros removed via simple run-length: // (<skip-take byte> <data bytes>)* // skip-take: // 00 end // x0 skip 16 x locations, take 0 data values // xy skip x locations, take y data values // Multiply all the incoming values by 3 to account for 3x unigram sums // // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35, // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255" // // Weight is 0..100 percent // // Returns subscript of largest (most probable) value // // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__" // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit] int ApplyCompressedProb(const char* iprob, int len, int weight, DetectEncodingState* destatep) { … } // Returns subscript of largest (most probable) value [for unit test] int TopCompressedProb(const char* iprob, int len) { … } // Find subscript of matching key in first 8 bytes of sorted hint array, or -1 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize, const char* norm_key) { … } // Find subscript of matching key in first 4 bytes of sorted hint array, or -1 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize, const char* norm_key) { … } static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) { … } static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) { … } // Apply initial probability hint based on top level domain name // Weight is 0..100 percent // Return 1 if name match found int ApplyTldHint(const char* url_tld_hint, int weight, DetectEncodingState* destatep) { … } // Apply initial probability hint based on charset= name // Weight is 0..100 
percent // Return 1 if name match found int ApplyCharsetHint(const char* charset_hint, int weight, DetectEncodingState* destatep) { … } // Apply initial probability hint based on caller-supplied encoding // Negative hint whacks ~encoding, non-negative boosts encoding // // Negative hints are an experiment to see if they might be useful. // Not operator used instead of unary minus to allow specifying not-zero int ApplyEncodingHint(const int encoding_hint, int weight, DetectEncodingState* destatep) { … } // Apply initial probability hint based on user interface language // Weight is 0..100 percent // Return 1 if name match found int ApplyUILanguageHint(const Language language_hint, int weight, DetectEncodingState* destatep) { … } // Apply initial probability hint based on corpus type (web, email, etc) // Return 1 if name match found int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type, DetectEncodingState* destatep) { … } // Do reverse search for c in [str..str+len) // Note: initial pointer is to FRONT of string, not back const char* MyMemrchr(const char* str, char c, size_t len) { … } // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD // Now that we are no longer trying to do Indic font-based encodings, we // don't need the full URL and can go back to simple TLD. This test remains for // backwards compatibility with any caller using full URL.
static const int kMinURLLength = …; // Extract TLD from a full URL or just a TLD // Return hostname and length if a full URL void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len, const char** ret_host_start, int* ret_host_len) { … } // Apply hints, if any, to probabilities // NOTE: Encoding probabilities are all zero at this point void ApplyHints(const char* url_hint, const char* http_charset_hint, const char* meta_charset_hint, const int encoding_hint, const Language language_hint, const CompactEncDet::TextCorpusType corpus_type, DetectEncodingState* destatep) { … } // Look for specific high-value patterns in the first 4 bytes // Byte order marks (BOM) // EFBBBF UTF-8 // FEFF UTF-16 BE // FFFE UTF-16 LE // FFFE0000 UTF-32 LE // 0000FEFF UTF-32 BE // // Likely UTF-x of seven-bit ASCII // 00xx UTF-16 BE xx printable ASCII // xx00 UTF-16 LE // 000000xx UTF-32 BE // xx000000 UTF-32 LE // void InitialBytesBoost(const uint8* src, int text_length, DetectEncodingState* destatep) { … } // Descending order int IntCompare(const void* v1, const void* v2) { … } bool Base64Char(uint8 c) { … } int Base64ScanLen(const uint8* start, const uint8* limit) { … } // Input is at least 8-character legal base64 string after +. // But might be say + "Presse+Termine" bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) { … } // Prune here after N bytes // Boost here for seven-bit sequences (at every prune) // if (sevenbitrankedencoding) // + UTF7 scan and boost/demote len mod 8 = 0 3 6 // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6 // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6 // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6 // [0F 2022 boost/demote] // 00 UTF16/32 scan and boost/demote offset = even/odd // // If still some seven-bit possibilities > pure ASCII, // scan each possibility for clearer prob, s.t.
about // two good sequences is a clear win // A-Z 00-19 00xx-64xx (B = 04xx) // a-z 1A-33 68xx-CCxx (f = 7Cxx) // 0-9 34-3D D0xx-F4xx (1 = D4xx) // + 3E F8xx // / 3F FCxx // do another chunk with slow scan // Boost, whack, or leave alone UTF-7 probability void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) { … } // Boost, whack, or leave alone HZ probability void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) { … } // Boost, whack, or leave alone BINARY probability void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … } // Demote UTF-16/32 on 0000 or FFFF, favoring Binary void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) { … } // Make even offset void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) { … } bool ConsecutivePair(DetectEncodingState* destatep, int i) { … } // boost, whack, or leave alone UTF-8 probability // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8 // Returns total boost int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) { … } // Boost, whack, or leave alone UTF8UTF8 probability // // We are looking for // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the // MS CP1252 mappings, and // (2) sequences of 2 or more such characters // // If so, we could be looking at some non-7-bit encoding extra-converted // to UTF-8. The most common observed is CP1252->UTF8 twice, // 1252=>UTF8 : 1252=>UTF8 // where the colon means "take those bytes and pretend that they are 1252".
// We have a couple of examples of BIG5 bytes converted as though // they were 1252, // BIG5 : 1252=>UTF8 // // Of course, we don't want correctly converted 1252 to be flagged here // 1252=>UTF8 // So we want the input high bytes to be in pairs or longer, hence the // output UTF8 in groups of four bytes or more // // Good chars: C2xx, C3xx, // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C // Good chars: E280xx E282AC E284A2 // C2xx 1100001x 10xxxxxx (128/128) // C5xx 11000101 10xx00xx (16/4) // C5xx 11000101 10111xxx (8/3) // C692 11000110 10010010 (1/1) // CBxx 11001011 100xx1x0 (8/2) // E28x 11100010 10000xx0 (4/3) // // Returns total boost int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) { … } // We give a gentle boost for each paired SO ... SI, whack others void CheckIso2022ActiveSeq(DetectEncodingState* destatep) { … } // We give a gentle boost for each paired ~{ ... ~}, whack others void CheckHzActiveSeq(DetectEncodingState* destatep) { … } // We give a gentle boost after an odd number of 8Fxxxx triples, which // put subsequent bigrams out of phase until a low byte or another 8Fxxxx void CheckEucJpSeq(DetectEncodingState* destatep) { … } // Boost, whack, or leave alone BINARY probability // Also called if UTF 16/32 active void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep, int delta_otherpairs) { … } // Look at a number of special-case encodings whose reliable detection depends // on sequencing or other properties // AsciiPair probabilities (UTF7 and HZ) are all done here void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) { … } void PrintTopEnc(DetectEncodingState* destatep, int n) { … } // If the same bigram repeats, don't boost its best encoding too much bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … } // Sometimes illegal bytes are used as markers between text that JavaScript // is going to decode. 
Don't overboost the Binary encoding for markers 01-FF. // Just count first pair per 8x4 bucket bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { … } // Find current top two rankedencoding probabilities void ReRank(DetectEncodingState* destatep) { … } void SimplePrune(DetectEncodingState* destatep, int prune_diff) { … } // Recalculate reliable void CalcReliable(DetectEncodingState* destatep) { … } // Find current top two rankedencoding probabilities void FindTop2(DetectEncodingState* destatep, int* first_renc, int* second_renc, int* first_prob, int* second_prob) { … } void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) { … } // Map unencoded bytes down to five bits, largely preserving letters // This design struggles to put 33 values into 5 bits. #define XX … #define HA … #define HE … #define HI … #define HO … #define HU … #define Hc … static const char kMapToFiveBits[256] = …; #undef XX #undef HA #undef HE #undef HI #undef HO #undef HU #undef Hc static const int kTriLatin1Likely = …; static const int kTriLatin2Likely = …; static const int kTriLatin7Likely = …; // Each table entry has 32 times two bits, selected by byte[2] // Entry subscript is selected by byte[0] and byte[1] // Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc static const uint64 kLatin127Trigrams[1024] = …; // Latin1 6%, Latin2 11%, Latin7 3% // Just for debugging. 
not thread-safe static char tri_string[4]; char* Latin127Str(int trisub) { … } // Returns two bits per three-byte trigram, indicating // don't-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely int TrigramValue(const uint8* trisrc) { … } // Put out trigrams for surrounding 32 bytes for Latin encodings // Return true if more Latin2 & 7 than Latin1 bool BoostLatin127Trigrams(int tri_block_offset, DetectEncodingState* destatep) { … } // Boost any encodings that need extra detection help, then prune // src is first unscanned byte // slowend means extra pruning when dropping out of initial slow scan // final means last call -- no bigram at src // NOTE(review): slowend/final presumably refer to values of prunereason -- confirm void BoostPrune(const uint8* src, DetectEncodingState* destatep, int prunereason) { … } // Accumulate aligned byte-pair at src // Occasionally, calc boost for some encodings and then prune the active list // weightshift is used to give low weight to some text, such as inside tags // Returns true if pruning occurred bool IncrementAndBoostPrune(const uint8* src, int remaining_length, DetectEncodingState* destatep, int weightshift, int exit_reason) { … } void DumpSummary(DetectEncodingState* destatep, int whatset, int n) { … } void BeginDetail(DetectEncodingState* destatep) { … } // Single character to represent (printable ASCII) gap between bigrams char DetailOffsetChar(int delta) { … } void DumpDetail(DetectEncodingState* destatep) { … } void PsRecurse(const char* buff) { … } void DumpReliable(DetectEncodingState* destatep) { … } // Scan short single lines quickly for all printable ASCII // Return true if all bytes are in [20..7F], false otherwise bool QuickPrintableAsciiScan(const char* text, int text_length) { … } static const int kMaxScanBack = …; // Return true if text is inside a tag or JS comment bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) { … } const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) { … } // Take a watch string and map to a ranked encoding. 
If no match, return -1 int LookupWatchEnc(const string& watch_str) { … } // Return true if enc and enc2 are equal or one is a subset of the other // or either is UNKNOWN // also UTF8UTF8 is compatible with both Latin1 and UTF8 bool CompatibleEnc(Encoding enc, Encoding enc2) { … } // Return superset of enc and enc2, which must be compatible Encoding SupersetEnc(Encoding enc, Encoding enc2) { … } // If unreliable, try rescoring to separate some encodings Encoding Rescore(Encoding enc, const uint8* isrc, const uint8* srctextlimit, DetectEncodingState* destatep) { … } // Given an encoding, add its corresponding ranked encoding to the set void AddToSet(Encoding enc, int* list_len, int* list) { … } static const int kMinRobustBigramCount = …; static const int kMinKBToRobustScan = …; static const int kMaxKBToRobustScan = …; // Scan the first 64K or so, just doing raw bigram increments on given // probability list. // No fancy duplicate filtering or anything else here. // Returns number of bigrams counted int RobustScan(const char* text, int text_length, int robust_renc_list_len, int* robust_renc_list, int* robust_renc_probs) { … } // If unreliable, rescan middle of document to see if we can get a better // answer. Rescan is only worthwhile if there are ~200 bytes or more left, // since the detector takes as much as 96 bytes of bigrams to decide. Encoding Rescan(Encoding enc, const uint8* isrc, const uint8* src, const uint8* srctextlimit, const char* url_hint, const char* http_charset_hint, const char* meta_charset_hint, const int encoding_hint, const Language language_hint, const CompactEncDet::TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, DetectEncodingState* destatep) { … } // With no hints at all, and perhaps on rescan, we relax our pickiness // and go ahead and accept the top multibyte encodings, even though // strictly their web pages should have declared an explicit encoding to // avoid the HTML standard's default ISO-8859-1. 
bool NoHintsCloseEnoughCompatible(Encoding top_enc) { … } // Scan raw bytes and detect most likely encoding // Design goals: // Skip over big initial stretches of seven-bit ASCII bytes very quickly // Thread safe // Works equally well on // 50-byte queries, // 5000-byte email and // 50000-byte web pages // Length 0 input returns ISO_8859_1 (ASCII) encoding // Setting ignore_7bit_mail_encodings effectively turns off detection of // UTF-7, HZ, and ISO-2022-xx Encoding InternalDetectEncoding( CEDInternalFlags flags, const char* text, int text_length, const char* url_hint, const char* http_charset_hint, const char* meta_charset_hint, const int encoding_hint, const Language language_hint, // User interface lang const CompactEncDet::TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable, Encoding* second_best_enc) { … } Encoding CompactEncDet::DetectEncoding( const char* text, int text_length, const char* url_hint, const char* http_charset_hint, const char* meta_charset_hint, const int encoding_hint, const Language language_hint, // User interface lang const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable) { … } // Return top encoding hint for given language-hint string Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) { … } // Return top encoding hint for given TLD-hint string Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) { … } // Return top encoding hint for given charset-hint string Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) { … } const char* CompactEncDet::Version(void) { … }