// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // //////////////////////////////////////////////////////////////////////////////// #include "util/encodings/encodings.h" #include <string.h> // for strcasecmp #include <unordered_map> #include <utility> // for pair #include "util/basictypes.h" #include "util/string_util.h" #include "util/case_insensitive_hash.h" struct EncodingInfo { … }; static const EncodingInfo kEncodingInfoTable[] = …; COMPILE_ASSERT(…); Encoding default_encoding() { … } // ************************************************************* // Encoding predicates // IsValidEncoding() // IsEncEncCompatible // IsEncodingWithSupportedLanguage // IsSupersetOfAscii7Bit // Is8BitEncoding // IsCJKEncoding // IsHebrewEncoding // IsRightToLeftEncoding // IsLogicalRightToLeftEncoding // IsVisualRightToLeftEncoding // IsIso2022Encoding // IsIso2022JpOrVariant // IsShiftJisOrVariant // IsJapaneseCellPhoneCarrierSpecificEncoding // ************************************************************* bool IsValidEncoding(Encoding enc) { … } bool IsEncEncCompatible(const Encoding from, const Encoding to) { … } // To be a superset of 7-bit Ascii means that bytes 0...127 in the given // encoding represent the same characters as they do in ISO_8859_1. // TODO: This list could be expanded. Many other encodings are supersets // of 7-bit Ascii. 
// In fact, Japanese JIS and Unicode are the only two
// encodings that I know for a fact should *not* be in this list.
bool IsSupersetOfAscii7Bit(Encoding e) { … }

// To be an 8-bit encoding means that there are at most 256 symbols.
// Each byte determines a new character; there are no multi-byte sequences.
// TODO: This list could maybe be expanded.  Other encodings may be 8-bit.
bool Is8BitEncoding(Encoding e) { … }

// Reports whether e is a CJK encoding.
// NOTE(review): the exact set of encodings covered is defined by the body --
// confirm before relying on it.
bool IsCJKEncoding(Encoding e) { … }

// Reports whether e is a Hebrew encoding.
bool IsHebrewEncoding(Encoding e) { … }

// Reports whether enc is a right-to-left encoding.
// NOTE(review): presumably the union of the logical and visual predicates
// below -- confirm against the body.
bool IsRightToLeftEncoding(Encoding enc) { … }

// Reports whether enc is a right-to-left encoding stored in logical order.
bool IsLogicalRightToLeftEncoding(Encoding enc) { … }

// Reports whether enc is a right-to-left encoding stored in visual order.
// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
// is NOT visual.
bool IsVisualRightToLeftEncoding(Encoding enc) { … }

// Reports whether enc is an ISO-2022 encoding.
bool IsIso2022Encoding(Encoding enc) { … }

// Reports whether enc is ISO-2022-JP or one of its variants.
bool IsIso2022JpOrVariant(Encoding enc) { … }

// Reports whether enc is Shift_JIS or one of its variants.
bool IsShiftJisOrVariant(Encoding enc) { … }

// Reports whether enc is an encoding specific to a Japanese cell phone
// carrier.
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { … }

// *************************************************************
// ENCODING NAMES
//   EncodingName() [Encoding to name]
//   MimeEncodingName() [Encoding to name]
//   EncodingFromName() [name to Encoding]
//   EncodingNameAliasToEncoding() [name to Encoding]
//   default_encoding_name()
//   invalid_encoding_name()
// *************************************************************

// Maps an Encoding to its name.
// NOTE(review): behavior for an invalid enc (presumably the value of
// invalid_encoding_name()) must be confirmed against the body.
const char * EncodingName(const Encoding enc) { … }

// Maps an Encoding to its MIME name.
// TODO: Unify MimeEncodingName and EncodingName, or determine why
// such a unification is not possible.
const char * MimeEncodingName(Encoding enc) { … }

// Maps a name to an Encoding, reporting the result through `encoding`.
// NOTE(review): presumably returns true and writes *encoding on a match and
// returns false otherwise; handling of a null enc_name and the value left in
// *encoding on failure must be confirmed against the body.
bool EncodingFromName(const char* enc_name, Encoding *encoding) { … }

// The encoding_map maps standard and non-standard encoding-names
// (strings) to Encoding enums.  It is used only by
// EncodingNameAliasToEncoding.  Note that the map uses
// case-insensitive hash and comparison functions.
// NOTE(review): only the tail of the EncodingMap type definition is visible
// here; the map type that precedes the name is missing from this view.
EncodingMap;

// Returns a read-only reference to the name -> Encoding map.
// NOTE(review): how and when the map is built (e.g. lazily on first call)
// is not visible -- confirm against the body.
static const EncodingMap& GetEncodingMap() { … }

// ----------------------------------------------------------------------
// EncodingNameAliasToEncoding()
//
// This function takes an encoding name/alias and returns the Encoding
// enum.  The input is case insensitive.  The accepted names are the union
// of the common IANA standard names, the charset names used in Netscape
// Navigator, and some common names we have been using.
// See: http://www.iana.org/assignments/character-sets
//      http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
//
// UNKNOWN_ENCODING is returned if none matches.
//
// TODO: Check if it is possible to remove the non-standard,
// non-netscape-use names.  This routine is used for encoding detection
// from HTML meta info, and non-standard names may introduce noise into
// that detection.
//
// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
// or determine why such a unification is not possible.
// ----------------------------------------------------------------------
Encoding EncodingNameAliasToEncoding(const char *encoding_name) { … }

// Returns the name of the default encoding.
// NOTE(review): presumably the name of default_encoding() -- confirm.
const char* default_encoding_name() { … }

// File-local name string used for invalid encodings.
static const char* const kInvalidEncodingName = …;

// Returns the name used for invalid encodings.
// NOTE(review): presumably returns kInvalidEncodingName -- confirm.
const char *invalid_encoding_name() { … }

// *************************************************************
// Miscellany
// *************************************************************

// Maps enc to the encoding preferred for web output.
// NOTE(review): which encodings are remapped, and to what, is defined by the
// body -- confirm before relying on a specific mapping.
Encoding PreferredWebOutputEncoding(Encoding enc) { … }