// Copyright 2011 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40284755): Remove this and spanify to fix the errors. #pragma allow_unsafe_buffers #endif #include "base/i18n/icu_string_conversions.h" #include <math.h> #include <stdarg.h> #include <stddef.h> #include <limits> #include <sstream> #include "base/check_op.h" #include "base/format_macros.h" #include "base/strings/stringprintf.h" #include "base/strings/utf_string_conversions.h" #include "build/build_config.h" #include "testing/gtest/include/gtest/gtest.h" namespace base { namespace { // Given a null-terminated string of wchar_t with each wchar_t representing // a UTF-16 code unit, returns a std::u16string made up of wchar_t's in the // input. Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) // should be represented as a surrogate pair (two UTF-16 units) // *even* where wchar_t is 32-bit (Linux and Mac). // // This is to help write tests for functions with std::u16string params until // the C++11 UTF-16 string literal (u"...") is well-supported by compilers. std::u16string BuildString16(const wchar_t* s) { … } } // namespace // kConvertCodepageCases is not comprehensive. There are a number of cases // to add if we really want to have comprehensive coverage of various // codepages and their 'idiosyncrasies'. Currently, the only implementation // for CodepageTo* and *ToCodepage uses ICU, which has a very extensive // set of tests for the charset conversion. So, we can get away with a // relatively small number of cases listed below. // // Note about |u16_wide| in the following struct. // On Windows, the field is always identical to |wide|. On Mac and Linux, // it's identical as long as there's no character outside the // BMP (<= U+FFFF). 
When there is, it is different from |wide| and // is not a real wide string (UTF-32 string) in that each wchar_t in // the string is a UTF-16 code unit zero-extended to be 32-bit // even when the code unit belongs to a surrogate pair. // For instance, a Unicode string (U+0041 U+010000) is represented as // L"\x0041\xD800\xDC00" instead of L"\x0041\x10000". // To avoid the clutter, |u16_wide| will be set to NULL // if it's identical to |wide| on *all* platforms. static const struct { … } kConvertCodepageCases[] = …; TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { … } static const struct { … } kConvertAndNormalizeCases[] = …; TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) { … } } // namespace base