folly/folly/Unicode.h

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Some utility routines relating to Unicode.

#pragma once

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <string>

#include <folly/lang/Exception.h>

namespace folly {

//  Exception type for Unicode-related errors.
//
//  std::runtime_error has no default constructor, so the base-class
//  constructors are inherited; this lets unicode_error be constructed from a
//  message (std::string or const char*) exactly like std::runtime_error.
class FOLLY_EXPORT unicode_error : public std::runtime_error {
 public:
  using std::runtime_error::runtime_error;
};

//  Unicode code points are split into 17 planes.
//
//  The Basic Multilingual Plane covers code points in [0-0xFFFF] but reserves
//  two invalid ranges:
//  - High surrogates: [0xD800-0xDBFF].
//  - Low surrogates: [0xDC00-0xDFFF].
//
//  UTF-16 code units are 2 bytes wide and are represented here with char16_t.
//  Unicode code points are represented in UTF-16 as either 1 or 2 code units:
//  - Valid BMP code points [0x0000-0xD7FF] + [0xE000-0xFFFF] are encoded
//    directly as 1 code unit.
//  - Code points larger than BMP (>0xFFFF) are encoded as 2 code units, with
//    values respectively in the high surrogates and low surrogates ranges.
//
//  JSON text permits the inclusion of Unicode escape sequences within quoted
//  strings:
//  - Valid BMP code points are encoded as \uXXXX, where XXXX are the base-16
//    digits of the code point.
//  - Code points larger than BMP are encoded as \uHHHH\uLLLL, where HHHH and
//    LLLL are respectively the base-16 digits of the high and low surrogates of
//    the UTF-16 encoding of the code point.

//  Returns true when the UTF-16 code unit `c` directly encodes a valid BMP
//  code point, i.e. it lies in [0x0000-0xD7FF] or [0xE000-0xFFFF] and is
//  therefore outside both surrogate ranges.
inline bool utf16_code_unit_is_bmp(char16_t const c) {
  return c < 0xD800 || c > 0xDFFF;
}
//  Returns true when the UTF-16 code unit `c` lies in the high-surrogate
//  range [0xD800-0xDBFF], i.e. it is the first unit of a surrogate pair.
inline bool utf16_code_unit_is_high_surrogate(char16_t const c) {
  return c >= 0xD800 && c <= 0xDBFF;
}
//  Returns true when the UTF-16 code unit `c` lies in the low-surrogate
//  range [0xDC00-0xDFFF], i.e. it is the second unit of a surrogate pair.
inline bool utf16_code_unit_is_low_surrogate(char16_t const c) {
  return c >= 0xDC00 && c <= 0xDFFF;
}
//  Combines a UTF-16 surrogate pair into the code point it encodes.
//
//  Each surrogate carries 10 payload bits in its low bits: the high
//  surrogate supplies the upper 10 bits and the low surrogate the lower 10
//  bits of (code point - 0x10000).
//
//  Precondition: `high` is in [0xD800-0xDBFF] and `low` is in
//  [0xDC00-0xDFFF]. This is checked with assert only; when assertions are
//  compiled out, other inputs yield a meaningless value.
inline char32_t unicode_code_point_from_utf16_surrogate_pair(
    char16_t const high, char16_t const low) {
  assert(high >= 0xD800 && high <= 0xDBFF);
  assert(low >= 0xDC00 && low <= 0xDFFF);
  char32_t const hbits = (static_cast<char32_t>(high) & 0x03FFu) << 10;
  char32_t const lbits = static_cast<char32_t>(low) & 0x03FFu;
  return 0x10000u + (hbits | lbits);
}

//////////////////////////////////////////////////////////////////////

/*
 * Encode a single Unicode code point into a UTF-8 byte sequence.
 *
 * codePointToUtf8 returns the encoded bytes as a std::string.
 * appendCodePointToUtf8 receives the destination string `out' by non-const
 * reference; judging by its name it appends the encoding to `out' rather
 * than replacing its contents -- NOTE(review): the definitions are not in
 * this header, confirm against the implementation.
 *
 * Result is undefined if `cp' is an invalid code point.
 */
std::string codePointToUtf8(char32_t cp);
void appendCodePointToUtf8(char32_t cp, std::string& out);

/*
 * Decode a single Unicode code point from a UTF-8 byte sequence.
 *
 * `p' is a pointer into the input, taken by non-const reference, and `e' is
 * a second pointer into the same input. Presumably `p' is advanced past the
 * bytes consumed and `e' marks the end of the input -- NOTE(review): confirm
 * against the definition, which is not in this header.
 *
 * `skipOnError' presumably selects how malformed input is handled; its exact
 * effect (and whether an exception is thrown when it is false) cannot be
 * determined from this declaration -- TODO confirm.
 */
char32_t utf8ToCodePoint(
    const unsigned char*& p, const unsigned char* const e, bool skipOnError);

//////////////////////////////////////////////////////////////////////

} // namespace folly