// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef _LIBCPP___FORMAT_UNICODE_H #define _LIBCPP___FORMAT_UNICODE_H #include <__assert> #include <__bit/countl.h> #include <__concepts/same_as.h> #include <__config> #include <__format/extended_grapheme_cluster_table.h> #include <__format/indic_conjunct_break_table.h> #include <__iterator/concepts.h> #include <__iterator/readable_traits.h> // iter_value_t #include <__utility/unreachable.h> #include <string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 namespace __unicode { // Helper struct for the result of a consume operation. // // The status value for a correct code point is 0. This allows a valid value to // be used without masking. // When the decoding fails it knows the number of code units affected. For the // current use-cases that value is not needed, therefore it is not stored. // The escape routine needs the number of code units for both a valid and // invalid character and keeps track of it itself. Doing it in this result // unconditionally would give some overhead when the value is unneeded. 
struct __consume_result { … }; static_assert …; # ifndef _LIBCPP_HAS_NO_UNICODE /// Implements the grapheme cluster boundary rules /// /// These rules are used to implement format's width estimation as stated in /// [format.string.std]/11 /// /// The Standard refers to UAX \#29 for Unicode 12.0.0 /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules /// /// The data tables used are /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only) inline constexpr char32_t __replacement_character = …; // The error of a consume operation. // // This sets the code point to the replacement character. This code point does // not participate in the grapheme clustering, so grapheme clustering code can // ignore the error status and always use the code point. inline constexpr __consume_result __consume_result_error{ … }; [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) { … } [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) { … } // https://www.unicode.org/glossary/#surrogate_code_point [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) { … } // https://www.unicode.org/glossary/#code_point [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) { … } // https://www.unicode.org/glossary/#unicode_scalar_value [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) { … } template <contiguous_iterator _Iterator> requires same_as<iter_value_t<_Iterator>, char> _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) { … } /// Helper class to extract a code unit from a Unicode character range. /// /// The stored range is a view. 
There are multiple specializations for different /// character types. template <class _CharT> class __code_point_view; /// UTF-8 specialization. template <> class __code_point_view<char> { … }; # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) { … } _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) { … } /// This specialization depends on the size of wchar_t /// - 2 UTF-16 (for example Windows and AIX) /// - 4 UTF-32 (for example Linux) template <> class __code_point_view<wchar_t> { … }; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS // State machine to implement the Extended Grapheme Cluster Boundary // // The exact rules may change between Unicode versions. // This implements the extended rules see // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries class __extended_grapheme_cluster_break { … }; /// Helper class to extract an extended grapheme cluster from a Unicode character range. /// /// This function is used to determine the column width of an extended grapheme /// cluster. In order to do that only the first code point is evaluated. /// Therefore only this code point is extracted. template <class _CharT> class __extended_grapheme_cluster_view { … }; template <contiguous_iterator _Iterator> __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>; # else // _LIBCPP_HAS_NO_UNICODE // For ASCII every character is a "code point". // This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define. 
template <class _CharT> class __code_point_view { using _Iterator = typename basic_string_view<_CharT>::const_iterator; public: _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last) : __first_(__first), __last_(__last) {} _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; } [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept { _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input"); return {static_cast<char32_t>(*__first_++)}; } private: _Iterator __first_; _Iterator __last_; }; # endif // _LIBCPP_HAS_NO_UNICODE } // namespace __unicode #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___FORMAT_UNICODE_H