encodings.cc | Explore in Territory

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#include "util/encodings/encodings.h"

#include <string.h>                     // for strcasecmp
#include <unordered_map>
#include <utility>                      // for pair

#include "util/basictypes.h"
#include "util/string_util.h"
#include "util/case_insensitive_hash.h"

struct EncodingInfo { … };

static const EncodingInfo kEncodingInfoTable[] = …;



COMPILE_ASSERT(…);

Encoding default_encoding() { … }

// *************************************************************
// Encoding predicates
//   IsValidEncoding()
//   IsEncEncCompatible
//   IsEncodingWithSupportedLanguage
//   IsSupersetOfAscii7Bit
//   Is8BitEncoding
//   IsCJKEncoding
//   IsHebrewEncoding
//   IsRightToLeftEncoding
//   IsLogicalRightToLeftEncoding
//   IsVisualRightToLeftEncoding
//   IsIso2022Encoding
//   IsIso2022JpOrVariant
//   IsShiftJisOrVariant
//   IsJapaneseCellPhoneCarrierSpecificEncoding
// *************************************************************

bool IsValidEncoding(Encoding enc) { … }

bool IsEncEncCompatible(const Encoding from, const Encoding to) { … }

// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
// encoding represent the same characters as they do in ISO_8859_1.

// TODO: This list could be expanded.  Many other encodings are supersets
// of 7-bit Ascii.  In fact, Japanese JIS and Unicode are the only two
// encodings that I know for a fact should *not* be in this list.
bool IsSupersetOfAscii7Bit(Encoding e) { … }

// To be an 8-bit encoding means that there are fewer than 256 symbols.
// Each byte determines a new character; there are no multi-byte sequences.

// TODO: This list could maybe be expanded.  Other encodings may be 8-bit.
bool Is8BitEncoding(Encoding e) { … }

bool IsCJKEncoding(Encoding e) { … }

bool IsHebrewEncoding(Encoding e) { … }



bool IsRightToLeftEncoding(Encoding enc) { … }

bool IsLogicalRightToLeftEncoding(Encoding enc) { … }

// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
// is NOT visual.
bool IsVisualRightToLeftEncoding(Encoding enc) { … }





bool IsIso2022Encoding(Encoding enc) { … }

bool IsIso2022JpOrVariant(Encoding enc) { … }

bool IsShiftJisOrVariant(Encoding enc) { … }

bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { … }


// *************************************************************
// ENCODING NAMES
//   EncodingName() [Encoding to name]
//   MimeEncodingName() [Encoding to name]
//   EncodingFromName() [name to Encoding]
//   EncodingNameAliasToEncoding() [name to Encoding]
//   default_encoding_name()
//   invalid_encoding_name()
// *************************************************************

const char * EncodingName(const Encoding enc) { … }

// TODO: Unify MimeEncodingName and EncodingName, or determine why
// such a unification is not possible.

const char * MimeEncodingName(Encoding enc) { … }

bool EncodingFromName(const char* enc_name, Encoding *encoding) { … }

// The encoding_map maps standard and non-standard encoding-names
// (strings) to Encoding enums. It is used only by
// EncodingNameAliasToEncoding. Note that the map uses
// case-insensitive hash and comparison functions.

EncodingMap;

static const EncodingMap& GetEncodingMap() { … }

// ----------------------------------------------------------------------
// EncodingNameAliasToEncoding()
//
// This function takes an encoding name/alias and returns the Encoding
// enum. The input is case insensitive. It is the union of the common
// IANA standard names, the charset names used in Netscape Navigator,
// and some common names we have been using.
// See: http://www.iana.org/assignments/character-sets
// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
//
// UNKNOWN_ENCODING is returned if none matches.
//
// TODO: Check if it is possible to remove the non-standard,
// non-netscape-use names. It is because this routine is used for
// encoding detections from html meta info. Non-standard names may
// introduce noise on encoding detection.
//
// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
// or determine why such a unification is not possible.
// ----------------------------------------------------------------------
Encoding EncodingNameAliasToEncoding(const char *encoding_name) { … }

const char* default_encoding_name() { … }

static const char* const kInvalidEncodingName = …;

const char *invalid_encoding_name() { … }



// *************************************************************
// Miscellany
// *************************************************************


Encoding PreferredWebOutputEncoding(Encoding enc) { … }
chromium/third_party/ced/src/util/encodings/encodings.cc