unicode.h | Explore in Territory

// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___FORMAT_UNICODE_H
#define _LIBCPP___FORMAT_UNICODE_H

#include <__assert>
#include <__bit/countl.h>
#include <__concepts/same_as.h>
#include <__config>
#include <__format/extended_grapheme_cluster_table.h>
#include <__format/indic_conjunct_break_table.h>
#include <__iterator/concepts.h>
#include <__iterator/readable_traits.h> // iter_value_t
#include <__utility/unreachable.h>
#include <string_view>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#  pragma GCC system_header
#endif

_LIBCPP_BEGIN_NAMESPACE_STD

#if _LIBCPP_STD_VER >= 20

namespace __unicode {

// Helper struct for the result of a consume operation.
//
// The status value for a correct code point is 0. This allows a valid value to
// be used without masking.
// When the decoding fails it know the number of code units affected. For the
// current use-cases that value is not needed, therefore it is not stored.
// The escape routine needs the number of code units for both a valid and
// invalid character and keeps track of it itself. Doing it in this result
// unconditionally would give some overhead when the value is unneeded.
struct __consume_result { … };
static_assert …;

#  ifndef _LIBCPP_HAS_NO_UNICODE

/// Implements the grapheme cluster boundary rules
///
/// These rules are used to implement format's width estimation as stated in
/// [format.string.std]/11
///
/// The Standard refers to UAX \#29 for Unicode 12.0.0
/// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
///
/// The data tables used are
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
/// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)

inline constexpr char32_t __replacement_character = …;

// The error of a consume operation.
//
// This sets the code point to the replacement character. This code point does
// not participate in the grapheme clustering, so grapheme clustering code can
// ignore the error status and always use the code point.
inline constexpr __consume_result __consume_result_error{ … };

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) { … }

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) { … }

// https://www.unicode.org/glossary/#surrogate_code_point
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) { … }

// https://www.unicode.org/glossary/#code_point
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) { … }

// https://www.unicode.org/glossary/#unicode_scalar_value
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) { … }

template <contiguous_iterator _Iterator>
  requires same_as<iter_value_t<_Iterator>, char>
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) { … }

/// Helper class to extract a code unit from a Unicode character range.
///
/// The stored range is a view. There are multiple specialization for different
/// character types.
template <class _CharT>
class __code_point_view;

/// UTF-8 specialization.
template <>
class __code_point_view<char> { … };

#    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) { … }

_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) { … }

/// This specialization depends on the size of wchar_t
/// - 2 UTF-16 (for example Windows and AIX)
/// - 4 UTF-32 (for example Linux)
template <>
class __code_point_view<wchar_t> { … };
#    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS

// State machine to implement the Extended Grapheme Cluster Boundary
//
// The exact rules may change between Unicode versions.
// This implements the extended rules see
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
class __extended_grapheme_cluster_break { … };

/// Helper class to extract an extended grapheme cluster from a Unicode character range.
///
/// This function is used to determine the column width of an extended grapheme
/// cluster. In order to do that only the first code point is evaluated.
/// Therefore only this code point is extracted.
template <class _CharT>
class __extended_grapheme_cluster_view { … };

template <contiguous_iterator _Iterator>
__extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>;

#  else //  _LIBCPP_HAS_NO_UNICODE

// For ASCII every character is a "code point".
// This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define.
template <class _CharT>
class __code_point_view {
  using _Iterator = typename basic_string_view<_CharT>::const_iterator;

public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
      : __first_(__first), __last_(__last) {}

  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }

  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
    _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
    return {static_cast<char32_t>(*__first_++)};
  }

private:
  _Iterator __first_;
  _Iterator __last_;
};

#  endif //  _LIBCPP_HAS_NO_UNICODE

} // namespace __unicode

#endif // _LIBCPP_STD_VER >= 20

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP___FORMAT_UNICODE_H
chromium/third_party/libc++/src/include/__format/unicode.h