utf8.cc | Explore in Territory

/*
 * Copyright (C) 2007 Apple Inc.  All rights reserved.
 * Copyright (C) 2010 Patrick Gansterer <[email protected]>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "third_party/blink/renderer/platform/wtf/text/utf8.h"

#include <unicode/utf16.h>

#include "base/check.h"
#include "base/not_fatal_until.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/string_hasher.h"

namespace WTF {
namespace unicode {

inline int InlineUTF8SequenceLengthNonASCII(char b0) { … }

inline int InlineUTF8SequenceLength(char b0) { … }

// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow.  There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
static const unsigned char kFirstByteMark[7] = …;

ConversionResult ConvertLatin1ToUTF8(const LChar** source_start,
                                     const LChar* source_end,
                                     char** target_start,
                                     char* target_end) { … }

ConversionResult ConvertUTF16ToUTF8(const UChar** source_start,
                                    const UChar* source_end,
                                    char** target_start,
                                    char* target_end,
                                    bool strict) { … }

// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4, this returns false.  The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
static bool IsLegalUTF8(const unsigned char* source, int length) { … }

// Magic values subtracted from a buffer value during UTF8 conversion.
// This table contains as many values as there might be trailing bytes
// in a UTF-8 sequence.
static const UChar32 kOffsetsFromUTF8[6] = …;

static inline UChar32 ReadUTF8Sequence(const char*& sequence, unsigned length) { … }

ConversionResult ConvertUTF8ToUTF16(const char** source_start,
                                    const char* source_end,
                                    UChar** target_start,
                                    UChar* target_end,
                                    bool strict) { … }

unsigned CalculateStringLengthFromUTF8(const char* data,
                                       const char*& data_end,
                                       bool& seen_non_ascii,
                                       bool& seen_non_latin1) { … }

template <typename CharType>
ALWAYS_INLINE bool EqualWithUTF8Internal(const CharType* a,
                                         const CharType* a_end,
                                         const char* b,
                                         const char* b_end) { … }

bool EqualUTF16WithUTF8(const UChar* a,
                        const UChar* a_end,
                        const char* b,
                        const char* b_end) { … }

bool EqualLatin1WithUTF8(const LChar* a,
                         const LChar* a_end,
                         const char* b,
                         const char* b_end) { … }

}  // namespace unicode
}  // namespace WTF
chromium/third_party/blink/renderer/platform/wtf/text/utf8.cc