icu_string_conversions_unittest.cc | Explore in Territory

// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
#pragma allow_unsafe_buffers
#endif

#include "base/i18n/icu_string_conversions.h"

#include <math.h>
#include <stdarg.h>
#include <stddef.h>

#include <limits>
#include <sstream>

#include "base/check_op.h"
#include "base/format_macros.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {

namespace {

// Given a null-terminated string of wchar_t with each wchar_t representing
// a UTF-16 code unit, returns a std::u16string made up of wchar_t's in the
// input. Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
// should be represented as a surrogate pair (two UTF-16 units)
// *even* where wchar_t is 32-bit (Linux and Mac).
//
// This is to help write tests for functions with std::u16string params until
// the C++11 UTF-16 string literal (u"...") is well-supported by compilers.
std::u16string BuildString16(const wchar_t* s) { … }

}  // namespace

// kConvertCodepageCases is not comprehensive. There are a number of cases
// to add if we really want to have comprehensive coverage of various
// codepages and their 'idiosyncrasies'. Currently, the only implementation
// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive
// set of tests for charset conversion. So, we can get away with the
// relatively small number of cases listed below.
//
// Note about |u16_wide| in the following struct.
// On Windows, the field is always identical to |wide|. On Mac and Linux,
// it's identical as long as every character is within the BMP
// (<= U+FFFF). When a non-BMP character is present, it differs from |wide|
// and is not a real wide string (UTF-32 string): each wchar_t in
// the string is a UTF-16 code unit zero-extended to 32 bits,
// even when the code unit belongs to a surrogate pair.
// For instance, the Unicode string (U+0041 U+10000) is represented as
// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000".
// To avoid clutter, |u16_wide| is set to NULL
// if it's identical to |wide| on *all* platforms.

static const struct { … } kConvertCodepageCases[] = …;

TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { … }

static const struct { … } kConvertAndNormalizeCases[] = …;
TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) { … }

}  // namespace base
chromium/base/i18n/icu_string_conversions_unittest.cc