unicodeobject.c | Explore in Territory

/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <[email protected]>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#include "Python.h"
#include "pycore_abstract.h"      // _PyIndex_Check()
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
#include "pycore_codecs.h"        // _PyCodec_Lookup()
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
#include "pycore_format.h"        // F_LJUST
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
#include "pycore_initconfig.h"    // _PyStatus_OK()
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
#include "pycore_long.h"          // _PyLong_FormatWriter()
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()

#include "stringlib/eq.h"         // unicode_eq()
#include <stddef.h>               // ptrdiff_t

#ifdef MS_WINDOWS
#include <windows.h>
#endif

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
#endif

/* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */


/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/

// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE …

#ifdef Py_DEBUG
#define _PyUnicode_CHECK …
#else
#define _PyUnicode_CHECK(op) …
#endif

#define _PyUnicode_UTF8(op) …
#define PyUnicode_UTF8(op) …
#define _PyUnicode_UTF8_LENGTH(op) …
#define PyUnicode_UTF8_LENGTH(op) …

#define _PyUnicode_LENGTH(op) …
#define _PyUnicode_STATE(op) …
#define _PyUnicode_HASH(op) …
#define _PyUnicode_KIND(op) …
#define _PyUnicode_GET_LENGTH(op) …
#define _PyUnicode_DATA_ANY(op) …

#define _PyUnicode_SHARE_UTF8(op) …

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) …

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) …

#define LATIN1 …

#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#define OVERALLOCATE_FACTOR …
#else
   /* On Linux, overallocate by 25% is the best factor */
#define OVERALLOCATE_FACTOR …
#endif

/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                    const char *errors);
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed);
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
                           _Py_error_handler error_handler, const char *errors,
                           Py_ssize_t *consumed);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
#endif


// Return a reference to the immortal empty string singleton.
static inline PyObject* unicode_get_empty(void)
{ … }

/* This dictionary holds per-interpreter interned strings.
 * See InternalDocs/string_interning.md for details.
 */
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
{ … }

/* This hashtable holds statically allocated interned strings.
 * See InternalDocs/string_interning.md for details.
 */
#define INTERNED_STRINGS …

/* Get number of all interned strings for the current interpreter. */
Py_ssize_t
_PyUnicode_InternedSize(void)
{ … }

/* Get number of immortal interned strings for the current interpreter. */
Py_ssize_t
_PyUnicode_InternedSize_Immortal(void)
{ … }

static Py_hash_t unicode_hash(PyObject *);

static Py_uhash_t
hashtable_unicode_hash(const void *key)
{ … }

static int
hashtable_unicode_compare(const void *key1, const void *key2)
{ … }

/* Return true if this interpreter should share the main interpreter's
   intern_dict.  That's important for interpreters which load basic
   single-phase init extension modules (m_size == -1).  There could be interned
   immortal strings that are shared between interpreters, due to the
   PyDict_Update(mdict, m_copy) call in import_find_extension().

   It's not safe to deallocate those strings until all interpreters that
   potentially use them are freed.  By storing them in the main interpreter, we
   ensure they get freed after all other interpreters are freed.
*/
static bool
has_shared_intern_dict(PyInterpreterState *interp)
{ … }

static int
init_interned_dict(PyInterpreterState *interp)
{ … }

static void
clear_interned_dict(PyInterpreterState *interp)
{ … }

static PyStatus
init_global_interned_strings(PyInterpreterState *interp)
{ … }

static void clear_global_interned_strings(void)
{ … }

#define _Py_RETURN_UNICODE_EMPTY() …

static inline void
unicode_fill(int kind, void *data, Py_UCS4 value,
             Py_ssize_t start, Py_ssize_t length)
{ … }


/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = …;

/* forward */
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);

/* Same for linebreaks */
static const unsigned char ascii_linebreak[] = …;

static int convert_uc(PyObject *obj, void *addr);

struct encoding_map;
#include "clinic/unicodeobject.c.h"

_Py_error_handler
_Py_GetErrorHandler(const char *errors)
{ … }


static _Py_error_handler
get_error_handler_wide(const wchar_t *errors)
{ … }


static inline int
unicode_check_encoding_errors(const char *encoding, const char *errors)
{ … }


int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{ … }

static PyObject*
unicode_result(PyObject *unicode)
{ … }

static PyObject*
unicode_result_unchanged(PyObject *unicode)
{ … }

/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
   ASCII, Latin1, UTF-8, etc. */
static char*
backslashreplace(_PyBytesWriter *writer, char *str,
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
{ … }

/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
   ASCII, Latin1, UTF-8, etc. */
static char*
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
{ … }

/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH …
#elif LONG_BIT >= 64
#define BLOOM_WIDTH …
#elif LONG_BIT >= 32
#define BLOOM_WIDTH …
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK …

static BLOOM_MASK bloom_linebreak = …;

#define BLOOM(mask, ch) …

#define BLOOM_LINEBREAK(ch) …

static inline BLOOM_MASK
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
{ … }

static int
ensure_unicode(PyObject *obj)
{ … }

/* Compilation of templated routines */

#define STRINGLIB_GET_EMPTY …

#include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#undef STRINGLIB_GET_EMPTY

/* --- Unicode Object ----------------------------------------------------- */

static inline Py_ssize_t
findchar(const void *s, int kind,
         Py_ssize_t size, Py_UCS4 ch,
         int direction)
{ … }

#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif

static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{ … }

static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{ … }

static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{ … }

static const char*
unicode_kind_name(PyObject *unicode)
{ … }

#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */
const char *_PyUnicode_utf8(void *unicode_raw){
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return PyUnicode_UTF8(unicode);
}

const void *_PyUnicode_compact_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return _PyUnicode_COMPACT_DATA(unicode);
}
const void *_PyUnicode_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    printf("obj %p\n", (void*)unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
    const void *data;

    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);

    if (!ascii->state.ascii) {
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif


PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{ … }

static int
unicode_check_modifiable(PyObject *unicode)
{ … }

static int
_copy_characters(PyObject *to, Py_ssize_t to_start,
                 PyObject *from, Py_ssize_t from_start,
                 Py_ssize_t how_many, int check_maxchar)
{ … }

void
_PyUnicode_FastCopyCharacters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
{ … }

Py_ssize_t
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
                         PyObject *from, Py_ssize_t from_start,
                         Py_ssize_t how_many)
{ … }

/* Find the maximum code point and count the number of surrogate pairs so a
   correct string length can be computed before converting a string to UCS4.
   This function counts single surrogates as a character and not as a pair.

   Return 0 on success, or -1 on error. */
static int
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
{ … }

static void
unicode_dealloc(PyObject *unicode)
{ … }

#ifdef Py_DEBUG
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    if (ascii->length == 1) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
        if (ch < 256 && LATIN1(ch) == unicode) {
            return 1;
        }
    }
    return 0;
}
#endif

static int
unicode_modifiable(PyObject *unicode)
{ … }

static int
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
{ … }

int
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
{ … }

/* Copy an ASCII or latin1 char* string into a Python Unicode string.

   WARNING: The function doesn't copy the terminating null character and
   doesn't check the maximum character (may write a latin1 character in an
   ASCII string). */
static void
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
                   const char *str, Py_ssize_t len)
{ … }

static PyObject*
get_latin1_char(Py_UCS1 ch)
{ … }

static PyObject*
unicode_char(Py_UCS4 ch)
{ … }


static inline void
unicode_write_widechar(int kind, void *data,
                       const wchar_t *u, Py_ssize_t size,
                       Py_ssize_t num_surrogates)
{ … }


PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{ … }


int
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
                              const wchar_t *str,
                              Py_ssize_t size)
{ … }


PyObject *
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{ … }

PyObject *
PyUnicode_FromString(const char *u)
{ … }


PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
{ … }


static void
unicode_clear_identifiers(struct _Py_unicode_state *state)
{ … }


/* Internal function, doesn't check maximum character */

PyObject*
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
{ … }

static Py_UCS4
kind_maxchar_limit(int kind)
{ … }

static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
{ … }

static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
{ … }

static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
{ … }


int
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
                          Py_UCS4 *str,
                          Py_ssize_t size)
{ … }


PyObject*
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{ … }

Py_UCS4
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
{ … }

/* Ensure that a string uses the most efficient storage, if it is not the
   case: create a new string with of the right kind. Write NULL into *p_unicode
   on error. */
static void
unicode_adjust_maxchar(PyObject **p_unicode)
{ … }

PyObject*
_PyUnicode_Copy(PyObject *unicode)
{ … }


/* Widen Unicode objects to larger buffers. Don't write terminating null
   character. Return NULL on error. */

static void*
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
{ … }

static Py_UCS4*
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
        int copy_null)
{ … }

Py_UCS4*
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
                 int copy_null)
{ … }

Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject *string)
{ … }

/* maximum number of characters required for output of %jo or %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL. */
#define MAX_INTMAX_CHARS …

static int
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
                             Py_ssize_t width, Py_ssize_t precision, int flags)
{ … }

static int
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
                              Py_ssize_t width, Py_ssize_t precision, int flags)
{ … }

static int
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
                              Py_ssize_t width, Py_ssize_t precision, int flags)
{ … }

#define F_LONG …
#define F_LONGLONG …
#define F_SIZE …
#define F_PTRDIFF …
#define F_INTMAX …

static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{ … }

static int
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
{ … }

PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{ … }

PyObject *
PyUnicode_FromFormat(const char *format, ...)
{ … }

int
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
{ … }

static Py_ssize_t
unicode_get_widechar_size(PyObject *unicode)
{ … }

static void
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
{ … }

#ifdef HAVE_WCHAR_H

/* Convert a Unicode object to a wide character string.

   - If w is NULL: return the number of wide characters (including the null
     character) required to convert the unicode object. Ignore size argument.

   - Otherwise: return the number of wide characters (excluding the null
     character) written into w. Write at most size wide characters (including
     the null character). */
Py_ssize_t
PyUnicode_AsWideChar(PyObject *unicode,
                     wchar_t *w,
                     Py_ssize_t size)
{ … }

wchar_t*
PyUnicode_AsWideCharString(PyObject *unicode,
                           Py_ssize_t *size)
{ … }

#endif /* HAVE_WCHAR_H */

int
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
{ … }

int
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
{ … }

PyObject *
PyUnicode_FromOrdinal(int ordinal)
{ … }

PyObject *
PyUnicode_FromObject(PyObject *obj)
{ … }

PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{ … }

/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{ … }

PyObject *
PyUnicode_Decode(const char *s,
                 Py_ssize_t size,
                 const char *encoding,
                 const char *errors)
{ … }

PyObject *
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{ … }

PyObject *
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{ … }

PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{ … }


static PyObject *
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
                      int current_locale)
{ … }

PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{ … }

PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{ … }

PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{ … }

PyObject *
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{ … }

static PyObject*
unicode_decode_locale(const char *str, Py_ssize_t len,
                      _Py_error_handler errors, int current_locale)
{ … }

PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{ … }

PyObject*
PyUnicode_DecodeLocale(const char *str, const char *errors)
{ … }


PyObject*
PyUnicode_DecodeFSDefault(const char *s) { … }

PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{ … }


int
PyUnicode_FSConverter(PyObject* arg, void* addr)
{ … }


int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{ … }


static int unicode_fill_utf8(PyObject *unicode);

const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{ … }

const char *
PyUnicode_AsUTF8(PyObject *unicode)
{ … }

const char *
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
{ … }

/*
PyUnicode_GetSize() has been deprecated since Python 3.3
because it returned length of Py_UNICODE.

But this function is part of stable abi, because it doesn't
include Py_UNICODE in signature and it was not excluded from
stable ABI in PEP 384.
*/
PyAPI_FUNC(Py_ssize_t)
PyUnicode_GetSize(PyObject *unicode)
{ … }

Py_ssize_t
PyUnicode_GetLength(PyObject *unicode)
{ … }

Py_UCS4
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
{ … }

int
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
{ … }

const char *
PyUnicode_GetDefaultEncoding(void)
{ … }

/* create or adjust a UnicodeDecodeError */
static void
make_decode_exception(PyObject **exceptionObject,
                      const char *encoding,
                      const char *input, Py_ssize_t length,
                      Py_ssize_t startpos, Py_ssize_t endpos,
                      const char *reason)
{ … }

#ifdef MS_WINDOWS
static int
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
{
    if (newsize > *size) {
        wchar_t *newbuf = *buf;
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        *buf = newbuf;
    }
    *size = newsize;
    return 0;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */

static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{ … }

/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64 …

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64 …

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64 …

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT …

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = …;

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT …

PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{ … }

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{ … }


PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{ … }

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{ … }

#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#if (SIZEOF_SIZE_T == 8)
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char. */
#define ASCII_CHAR_MASK …
// used to count codepoints in UTF-8 string.
#define VECTOR_0101 …
#define VECTOR_00FF …
#elif (SIZEOF_SIZE_T == 4)
#define ASCII_CHAR_MASK …
#define VECTOR_0101 …
#define VECTOR_00FF …
#else
# error C 'size_t' size should be either 4 or 8!
#endif

#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ …
static inline unsigned int
ctz(size_t v)
{ … }
#elif defined(_MSC_VER)
#define HAVE_CTZ …
static inline unsigned int
ctz(size_t v)
{
    unsigned long pos;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return pos;
}
#else
#define HAVE_CTZ …
#endif

#if HAVE_CTZ && PY_LITTLE_ENDIAN
// load p[0]..p[size-1] as a size_t without unaligned access nor read ahead.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{ … }
#endif

/*
 * Find the first non-ASCII character in a byte sequence.
 *
 * This function scans a range of bytes from `start` to `end` and returns the
 * index of the first byte that is not an ASCII character (i.e., has the most
 * significant bit set). If all characters in the range are ASCII, it returns
 * `end - start`.
 */
static Py_ssize_t
find_first_nonascii(const unsigned char *start, const unsigned char *end)
{ … }

static inline int
scalar_utf8_start_char(unsigned int ch)
{ … }

static inline size_t
vector_utf8_start_chars(size_t v)
{ … }


// Count the number of UTF-8 code points in a given byte sequence.
static Py_ssize_t
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
{ … }

static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{ … }

static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                         const char *starts, const char *s, const char *end,
                         _Py_error_handler error_handler,
                         const char *errors,
                         Py_ssize_t *consumed)
{ … }


static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{ … }


// Used by PyUnicodeWriter_WriteUTF8() implementation
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
                           _Py_error_handler error_handler, const char *errors,
                           Py_ssize_t *consumed)
{ … }


PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{ … }


/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
   non-zero, use strict error handler otherwise.

   On success, write a pointer to a newly allocated wide character string into
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
   (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On decoding error (if surrogateescape is zero), return -2. If wlen is
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
   is not NULL, write the decoding error message into *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{ … }


wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
                               size_t *wlen)
{ … }


/* UTF-8 encoder.

   On success, return 0 and write the newly allocated character string (use
   PyMem_Free() to free the memory) into *str.

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the decoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. */
int
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
                 const char **reason, int raw_malloc, _Py_error_handler errors)
{ … }


/* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  Else allocate the
   maximum possible needed (4 result bytes per Unicode character), and return
   the excess memory at the end.
*/
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                    const char *errors)
{ … }

static int
unicode_fill_utf8(PyObject *unicode)
{ … }

PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{ … }


PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{ … }

/* --- UTF-32 Codec ------------------------------------------------------- */

PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{ … }

PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{ … }

PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{ … }

PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{ … }

/* --- UTF-16 Codec ------------------------------------------------------- */

PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{ … }

PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{ … }

PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{ … }

PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{ … }

_PyUnicode_Name_CAPI *
_PyUnicode_GetNameCAPI(void)
{ … }

/* --- Unicode Escape Codec ----------------------------------------------- */

PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
{ … }

PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              Py_ssize_t *consumed)
{ … }

PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
                              Py_ssize_t size,
                              const char *errors)
{ … }

/* Return a Unicode-Escape string version of the Unicode object. */

PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{ … }

/* --- Raw Unicode Escape Codec ------------------------------------------- */

PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{ … }

PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{ … }


PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{ … }

/* --- Latin-1 Codec ------------------------------------------------------ */

PyObject *
PyUnicode_DecodeLatin1(const char *s,
                       Py_ssize_t size,
                       const char *errors)
{ … }

/* create or adjust a UnicodeEncodeError */
static void
make_encode_exception(PyObject **exceptionObject,
                      const char *encoding,
                      PyObject *unicode,
                      Py_ssize_t startpos, Py_ssize_t endpos,
                      const char *reason)
{ … }

/* raises a UnicodeEncodeError */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason)
{ … }

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   put the result into newpos and return the replacement string, which
   has to be freed by the caller */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
                                 PyObject **errorHandler,
                                 const char *encoding, const char *reason,
                                 PyObject *unicode, PyObject **exceptionObject,
                                 Py_ssize_t startpos, Py_ssize_t endpos,
                                 Py_ssize_t *newpos)
{ … }

static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{ … }

PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{ … }

PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{ … }

/* --- 7-bit ASCII Codec -------------------------------------------------- */

PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{ … }

PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{ … }

PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{ … }

#ifdef MS_WINDOWS

/* --- MBCS codecs for Windows -------------------------------------------- */

#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases also and avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE …

#ifndef WC_ERR_INVALID_CHARS
#define WC_ERR_INVALID_CHARS …
#endif

static const char*
code_page_name(UINT code_page, PyObject **obj)
{
    *obj = NULL;
    if (code_page == CP_ACP)
        return "mbcs";
    if (code_page == CP_UTF7)
        return "CP_UTF7";
    if (code_page == CP_UTF8)
        return "CP_UTF8";

    *obj = PyBytes_FromFormat("cp%u", code_page);
    if (*obj == NULL)
        return NULL;
    return PyBytes_AS_STRING(*obj);
}

static DWORD
decode_code_page_flags(UINT code_page)
{
    if (code_page == CP_UTF7) {
        /* The CP_UTF7 decoder only supports flags=0 */
        return 0;
    }
    else
        return MB_ERR_INVALID_CHARS;
}

/*
 * Decode a byte string from a Windows code page into unicode object in strict
 * mode.
 *
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
 * OSError and returns -1 on other error.
 */
static int
decode_code_page_strict(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in,
                        int insize)
{
    DWORD flags = MB_ERR_INVALID_CHARS;
    wchar_t *out;
    DWORD outsize;

    /* First get the size of the result */
    assert(insize > 0);
    while ((outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize, NULL, 0)) <= 0)
    {
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
            goto error;
        }
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
        flags = 0;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
        return -1;
    }
    out = *buf + n;

    /* Do the conversion */
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    if (outsize <= 0)
        goto error;
    return insize;

error:
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr(0);
    return -1;
}

/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(err);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}

static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        if (chunk_size == 0 && done) {
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}

PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,
                                 const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
{
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
}

PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
}

PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

static DWORD
encode_code_page_flags(UINT code_page, const char *errors)
{
    if (code_page == CP_UTF8) {
        return WC_ERR_INVALID_CHARS;
    }
    else if (code_page == CP_UTF7) {
        /* CP_UTF7 only supports flags=0 */
        return 0;
    }
    else {
        if (errors != NULL && strcmp(errors, "replace") == 0)
            return 0;
        else
            return WC_NO_BEST_FIT_CHARS;
    }
}

/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 * mode.
 *
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
 * an OSError and returns -1 on other error.
 */
static int
encode_code_page_strict(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t offset, int len,
                        const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar = &usedDefaultChar;
    int outsize;
    wchar_t *p;
    Py_ssize_t size;
    const DWORD flags = encode_code_page_flags(code_page, NULL);
    char *out;
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;
    int ret = -1;

    assert(len > 0);

    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
    p = PyUnicode_AsWideCharString(substring, &size);
    Py_CLEAR(substring);
    if (p == NULL) {
        return -1;
    }
    assert(size <= INT_MAX);

    /* First get the size of the result */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  NULL, 0,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            goto done;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Do the conversion */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }
    ret = 0;

done:
    PyMem_Free(p);
    return ret;

error:
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
        ret = -2;
        goto done;
    }
    PyErr_SetFromWindowsErr(0);
    goto done;
}

/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Returns consumed characters if succeed, or raise an OSError and returns
 * -1 on other error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            Py_ssize_t i;
            int kind;
            const void *data;

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}

static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    Py_ssize_t len;
    PyObject *outbytes = NULL;
    Py_ssize_t offset;
    int chunk_len, ret, done;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    len = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    offset = 0;
    do
    {
#ifdef NEED_RETRY
        if (len > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else
#endif
        {
            chunk_len = (int)len;
            done = 1;
        }

        ret = encode_code_page_strict(code_page, &outbytes,
                                      unicode, offset, chunk_len,
                                      errors);
        if (ret == -2)
            ret = encode_code_page_errors(code_page, &outbytes,
                                          unicode, offset,
                                          chunk_len, errors);
        if (ret < 0) {
            Py_XDECREF(outbytes);
            return NULL;
        }

        offset += chunk_len;
        len -= chunk_len;
    } while (!done);

    return outbytes;
}

PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}

PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */

/* --- Character Mapping Codec -------------------------------------------- */

static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{ … }

static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{ … }

PyObject *
PyUnicode_DecodeCharmap(const char *s,
                        Py_ssize_t size,
                        PyObject *mapping,
                        const char *errors)
{ … }

/* Charmap encoding: the lookup table */

/*[clinic input]
class EncodingMap "struct encoding_map *" "&EncodingMapType"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/

struct encoding_map { … };

/*[clinic input]
EncodingMap.size

Return the size (in bytes) of this object.
[clinic start generated code]*/

static PyObject *
EncodingMap_size_impl(struct encoding_map *self)
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
{ … }

static PyMethodDef encoding_map_methods[] = …;

static PyTypeObject EncodingMapType = …;

PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{ … }

static int
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
{ … }

/* Lookup the character in the mapping.
   On success, return PyLong, PyBytes or None (if the character can't be found).
   If the result is PyLong, put its value in replace.
   On error, return NULL.
   */
static PyObject *
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
{ … }

static int
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
{ … }

charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return a new reference to the object that
   was put in the output buffer, or Py_None, if the mapping was undefined
   (in which case no character was written) or NULL, if a
   reallocation error occurred. The caller must decref the result */
static charmapencode_result
charmapencode_output(Py_UCS4 c, PyObject *mapping,
                     PyObject **outobj, Py_ssize_t *outpos)
{ … }

/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{ … }

PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{ … }

PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{ … }

/* create or adjust a UnicodeTranslateError */
static void
make_translate_exception(PyObject **exceptionObject,
                         PyObject *unicode,
                         Py_ssize_t startpos, Py_ssize_t endpos,
                         const char *reason)
{ … }

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   put the result into newpos and return the replacement string, which
   has to be freed by the caller */
static PyObject *
unicode_translate_call_errorhandler(const char *errors,
                                    PyObject **errorHandler,
                                    const char *reason,
                                    PyObject *unicode, PyObject **exceptionObject,
                                    Py_ssize_t startpos, Py_ssize_t endpos,
                                    Py_ssize_t *newpos)
{ … }

/* Lookup the character ch in the mapping and put the result in result,
   which must be decrefed by the caller.
   The result can be PyLong, PyUnicode, None or NULL.
   If the result is PyLong, put its value in replace.
   Return 0 on success, -1 on error */
static int
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
{ … }

/* lookup the character, write the result into the writer.
   Return 1 if the result was written into the writer, return 0 if the mapping
   was undefined, raise an exception return -1 on error. */
static int
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
                        _PyUnicodeWriter *writer)
{ … }

static int
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
                              Py_UCS1 *translate)
{ … }

/* Fast path for ascii => ascii translation. Return 1 if the whole string
   was translated into writer, return 0 if the input string was partially
   translated into writer, raise an exception and return -1 on error. */
static int
unicode_fast_translate(PyObject *input, PyObject *mapping,
                       _PyUnicodeWriter *writer, int ignore,
                       Py_ssize_t *input_pos)
{ … }

static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{ … }

PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{ … }

PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{ … }

/* --- Helpers ------------------------------------------------------------ */

/* helper macro to fixup start/end slice values */
#define ADJUST_INDICES(start, end, len) …

static Py_ssize_t
any_find_slice(PyObject* s1, PyObject* s2,
               Py_ssize_t start,
               Py_ssize_t end,
               int direction)
{ … }

/* _PyUnicode_InsertThousandsGrouping() helper functions */
#include "stringlib/localeutil.h"

/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer.
 * @n_buffer: Number of characters in @buffer.
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{ … }

Py_ssize_t
PyUnicode_Count(PyObject *str,
                PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end)
{ … }

Py_ssize_t
PyUnicode_Find(PyObject *str,
               PyObject *substr,
               Py_ssize_t start,
               Py_ssize_t end,
               int direction)
{ … }

Py_ssize_t
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
                   Py_ssize_t start, Py_ssize_t end,
                   int direction)
{ … }

static int
tailmatch(PyObject *self,
          PyObject *substring,
          Py_ssize_t start,
          Py_ssize_t end,
          int direction)
{ … }

Py_ssize_t
PyUnicode_Tailmatch(PyObject *str,
                    PyObject *substr,
                    Py_ssize_t start,
                    Py_ssize_t end,
                    int direction)
{ … }

static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{ … }

static Py_UCS4
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
{ … }

static int
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
           Py_UCS4 c, Py_UCS4 *mapped)
{ … }

static Py_ssize_t
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{ … }

static Py_ssize_t
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { … }

static Py_ssize_t
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
                  Py_UCS4 *maxchar, int lower)
{ … }

static Py_ssize_t
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{ … }

static Py_ssize_t
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{ … }

static Py_ssize_t
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{ … }

static Py_ssize_t
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{ … }

static PyObject *
case_operation(PyObject *self,
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
{ … }

PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{ … }

PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{ … }

void
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
                    Py_UCS4 fill_char)
{ … }

Py_ssize_t
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
               Py_UCS4 fill_char)
{ … }

static PyObject *
pad(PyObject *self,
    Py_ssize_t left,
    Py_ssize_t right,
    Py_UCS4 fill)
{ … }

PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{ … }

static PyObject *
split(PyObject *self,
      PyObject *substring,
      Py_ssize_t maxcount)
{ … }

static PyObject *
rsplit(PyObject *self,
       PyObject *substring,
       Py_ssize_t maxcount)
{ … }

static Py_ssize_t
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
{ … }

static Py_ssize_t
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
{ … }

static void
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
{ … }

static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{ … }

/* --- Unicode Object Methods --------------------------------------------- */

/*[clinic input]
str.title as unicode_title

Return a version of the string where each word is titlecased.

More specifically, words start with uppercased characters and all remaining
cased characters have lower case.
[clinic start generated code]*/

static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{ … }

/*[clinic input]
str.capitalize as unicode_capitalize

Return a capitalized version of the string.

More specifically, make the first character have upper case and the rest lower
case.
[clinic start generated code]*/

static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
{ … }

/*[clinic input]
str.casefold as unicode_casefold

Return a version of the string suitable for caseless comparisons.
[clinic start generated code]*/

static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{ … }


/* Argument converter. Accepts a single Unicode character. */

static int
convert_uc(PyObject *obj, void *addr)
{ … }

/*[clinic input]
str.center as unicode_center

    width: Py_ssize_t
    fillchar: Py_UCS4 = ' '
    /

Return a centered string of length width.

Padding is done using the specified fill character (default is a space).
[clinic start generated code]*/

static PyObject *
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
{ … }

/* This function assumes that str1 and str2 are readied by the caller. */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{ … }


int
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
{ … }


int
PyUnicode_Equal(PyObject *str1, PyObject *str2)
{ … }


int
PyUnicode_Compare(PyObject *left, PyObject *right)
{ … }

int
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
{ … }

int
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
{ … }

int
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
{ … }

int
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
{ … }

int
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
{ … }

PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{ … }

int
PyUnicode_Contains(PyObject *str, PyObject *substr)
{ … }

/* Concat to string or Unicode object giving a new Unicode object. */

PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{ … }

void
PyUnicode_Append(PyObject **p_left, PyObject *right)
{ … }

void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{ … }

/*[clinic input]
@text_signature "($self, sub[, start[, end]], /)"
str.count as unicode_count -> Py_ssize_t

    self as str: self
    sub as substr: unicode
    start: slice_index(accept={int, NoneType}, c_default='0') = None
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
    /

Return the number of non-overlapping occurrences of substring sub in string S[start:end].

Optional arguments start and end are interpreted as in slice notation.
[clinic start generated code]*/

static Py_ssize_t
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
                   Py_ssize_t end)
/*[clinic end generated code: output=8fcc3aef0b18edbf input=6f168ffd94be8785]*/
{ … }

/*[clinic input]
str.encode as unicode_encode

    encoding: str(c_default="NULL") = 'utf-8'
        The encoding in which to encode the string.
    errors: str(c_default="NULL") = 'strict'
        The error handling scheme to use for encoding errors.
        The default is 'strict' meaning that encoding errors raise a
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
        'xmlcharrefreplace' as well as any other name registered with
        codecs.register_error that can handle UnicodeEncodeErrors.

Encode the string using the codec registered for encoding.
[clinic start generated code]*/

static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{ … }

/*[clinic input]
str.expandtabs as unicode_expandtabs

    tabsize: int = 8

Return a copy where all tab characters are expanded using spaces.

If tabsize is not given, a tab size of 8 characters is assumed.
[clinic start generated code]*/

static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{ … }

/*[clinic input]
str.find as unicode_find = str.count

Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].

Optional arguments start and end are interpreted as in slice notation.
Return -1 on failure.
[clinic start generated code]*/

static Py_ssize_t
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
                  Py_ssize_t end)
/*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/
{ … }

static PyObject *
unicode_getitem(PyObject *self, Py_ssize_t index)
{ … }

/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
static Py_hash_t
unicode_hash(PyObject *self)
{ … }

/*[clinic input]
str.index as unicode_index = str.count

Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].

Optional arguments start and end are interpreted as in slice notation.
Raises ValueError when the substring is not found.
[clinic start generated code]*/

static Py_ssize_t
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
                   Py_ssize_t end)
/*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/
{ … }

/*[clinic input]
str.isascii as unicode_isascii

Return True if all characters in the string are ASCII, False otherwise.

ASCII characters have code points in the range U+0000-U+007F.
Empty string is ASCII too.
[clinic start generated code]*/

static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{ … }

/*[clinic input]
str.islower as unicode_islower

Return True if the string is a lowercase string, False otherwise.

A string is lowercase if all cased characters in the string are lowercase and
there is at least one cased character in the string.
[clinic start generated code]*/

static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
{ … }

/*[clinic input]
str.isupper as unicode_isupper

Return True if the string is an uppercase string, False otherwise.

A string is uppercase if all cased characters in the string are uppercase and
there is at least one cased character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{ … }

/*[clinic input]
str.istitle as unicode_istitle

Return True if the string is a title-cased string, False otherwise.

In a title-cased string, upper- and title-case characters may only
follow uncased characters and lowercase characters only cased ones.
[clinic start generated code]*/

static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{ … }

/*[clinic input]
str.isspace as unicode_isspace

Return True if the string is a whitespace string, False otherwise.

A string is whitespace if all characters in the string are whitespace and there
is at least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
{ … }

/*[clinic input]
str.isalpha as unicode_isalpha

Return True if the string is an alphabetic string, False otherwise.

A string is alphabetic if all characters in the string are alphabetic and there
is at least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
{ … }

/*[clinic input]
str.isalnum as unicode_isalnum

Return True if the string is an alpha-numeric string, False otherwise.

A string is alpha-numeric if all characters in the string are alpha-numeric and
there is at least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
{ … }

/*[clinic input]
str.isdecimal as unicode_isdecimal

Return True if the string is a decimal string, False otherwise.

A string is a decimal string if all characters in the string are decimal and
there is at least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
{ … }

/*[clinic input]
str.isdigit as unicode_isdigit

Return True if the string is a digit string, False otherwise.

A string is a digit string if all characters in the string are digits and there
is at least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
{ … }

/*[clinic input]
str.isnumeric as unicode_isnumeric

Return True if the string is a numeric string, False otherwise.

A string is numeric if all characters in the string are numeric and there is at
least one character in the string.
[clinic start generated code]*/

static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
{ … }

Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject *self)
{ … }

int
PyUnicode_IsIdentifier(PyObject *self)
{ … }

/*[clinic input]
str.isidentifier as unicode_isidentifier

Return True if the string is a valid Python identifier, False otherwise.

Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
such as "def" or "class".
[clinic start generated code]*/

static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
{ … }

/*[clinic input]
str.isprintable as unicode_isprintable

Return True if the string is printable, False otherwise.

A string is printable if all of its characters are considered printable in
repr() or if it is empty.
[clinic start generated code]*/

static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
{ … }

/*[clinic input]
str.join as unicode_join

    iterable: object
    /

Concatenate any number of strings.

The string whose method is called is inserted in between each given string.
The result is returned as a new string.

Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
[clinic start generated code]*/

static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{ … }

static Py_ssize_t
unicode_length(PyObject *self)
{ … }

/*[clinic input]
str.ljust as unicode_ljust

    width: Py_ssize_t
    fillchar: Py_UCS4 = ' '
    /

Return a left-justified string of length width.

Padding is done using the specified fill character (default is a space).
[clinic start generated code]*/

static PyObject *
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
{ … }

/*[clinic input]
str.lower as unicode_lower

Return a copy of the string converted to lowercase.
[clinic start generated code]*/

static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{ … }

#define LEFTSTRIP …
#define RIGHTSTRIP …
#define BOTHSTRIP …

/* Arrays indexed by above */
static const char *stripfuncnames[] = …;

#define STRIPNAME(i) …

/* externally visible for str.strip(unicode) */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{ … }

PyObject*
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
{ … }

static PyObject *
do_strip(PyObject *self, int striptype)
{ … }


static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{ … }


/*[clinic input]
str.strip as unicode_strip

    chars: object = None
    /

Return a copy of the string with leading and trailing whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/

static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
{ … }


/*[clinic input]
str.lstrip as unicode_lstrip

    chars: object = None
    /

Return a copy of the string with leading whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/

static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{ … }


/*[clinic input]
str.rstrip as unicode_rstrip

    chars: object = None
    /

Return a copy of the string with trailing whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/

static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{ … }


static PyObject*
unicode_repeat(PyObject *str, Py_ssize_t len)
{ … }

PyObject *
PyUnicode_Replace(PyObject *str,
                  PyObject *substr,
                  PyObject *replstr,
                  Py_ssize_t maxcount)
{ … }

/*[clinic input]
str.replace as unicode_replace

    old: unicode
    new: unicode
    /
    count: Py_ssize_t = -1
        Maximum number of occurrences to replace.
        -1 (the default value) means replace all occurrences.

Return a copy with all occurrences of substring old replaced by new.

If the optional argument count is given, only the first count occurrences are
replaced.
[clinic start generated code]*/

static PyObject *
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
                     Py_ssize_t count)
/*[clinic end generated code: output=b63f1a8b5eebf448 input=3345c455d60a5499]*/
{ … }

/*[clinic input]
str.removeprefix as unicode_removeprefix

    prefix: unicode
    /

Return a str with the given prefix string removed if present.

If the string starts with the prefix string, return string[len(prefix):].
Otherwise, return a copy of the original string.
[clinic start generated code]*/

static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
{ … }

/*[clinic input]
str.removesuffix as unicode_removesuffix

    suffix: unicode
    /

Return a str with the given suffix string removed if present.

If the string ends with the suffix string and that suffix is not empty,
return string[:-len(suffix)]. Otherwise, return a copy of the original
string.
[clinic start generated code]*/

static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{ … }

static PyObject *
unicode_repr(PyObject *unicode)
{ … }

/*[clinic input]
str.rfind as unicode_rfind = str.count

Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].

Optional arguments start and end are interpreted as in slice notation.
Return -1 on failure.
[clinic start generated code]*/

static Py_ssize_t
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
                   Py_ssize_t end)
/*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/
{ … }

/*[clinic input]
str.rindex as unicode_rindex = str.count

Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].

Optional arguments start and end are interpreted as in slice notation.
Raises ValueError when the substring is not found.
[clinic start generated code]*/

static Py_ssize_t
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
                    Py_ssize_t end)
/*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/
{ … }

/*[clinic input]
str.rjust as unicode_rjust

    width: Py_ssize_t
    fillchar: Py_UCS4 = ' '
    /

Return a right-justified string of length width.

Padding is done using the specified fill character (default is a space).
[clinic start generated code]*/

static PyObject *
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
{ … }

PyObject *
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{ … }

/*[clinic input]
str.split as unicode_split

    sep: object = None
        The separator used to split the string.

        When set to None (the default value), will split on any whitespace
        character (including \n \r \t \f and spaces) and will discard
        empty strings from the result.
    maxsplit: Py_ssize_t = -1
        Maximum number of splits.
        -1 (the default value) means no limit.

Return a list of the substrings in the string, using sep as the separator string.

Splitting starts at the front of the string and works to the end.

Note, str.split() is mainly useful for data that has been intentionally
delimited.  With natural text that includes punctuation, consider using
the regular expression module.

[clinic start generated code]*/

static PyObject *
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
/*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
{ … }

PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{ … }


PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{ … }

/*[clinic input]
str.partition as unicode_partition

    sep: object
    /

Partition the string into three parts using the given separator.

This will search for the separator in the string.  If the separator is found,
returns a 3-tuple containing the part before the separator, the separator
itself, and the part after it.

If the separator is not found, returns a 3-tuple containing the original string
and two empty strings.
[clinic start generated code]*/

static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
{ … }

/*[clinic input]
str.rpartition as unicode_rpartition = str.partition

Partition the string into three parts using the given separator.

This will search for the separator in the string, starting at the end. If
the separator is found, returns a 3-tuple containing the part before the
separator, the separator itself, and the part after it.

If the separator is not found, returns a 3-tuple containing two empty strings
and the original string.
[clinic start generated code]*/

static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
{ … }

PyObject *
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{ … }

/*[clinic input]
str.rsplit as unicode_rsplit = str.split

Return a list of the substrings in the string, using sep as the separator string.

Splitting starts at the end of the string and works to the front.
[clinic start generated code]*/

static PyObject *
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
{ … }

/*[clinic input]
str.splitlines as unicode_splitlines

    keepends: bool = False

Return a list of the lines in the string, breaking at line boundaries.

Line breaks are not included in the resulting list unless keepends is given and
true.
[clinic start generated code]*/

static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
{ … }

static
PyObject *unicode_str(PyObject *self)
{ … }

/*[clinic input]
str.swapcase as unicode_swapcase

Convert uppercase characters to lowercase and lowercase characters to uppercase.
[clinic start generated code]*/

static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
{ … }

/*[clinic input]

@staticmethod
str.maketrans as unicode_maketrans

  x: object

  y: unicode=NULL

  z: unicode=NULL

  /

Return a translation table usable for str.translate().

If there is only one argument, it must be a dictionary mapping Unicode
ordinals (integers) or characters to Unicode ordinals, strings or None.
Character keys will be then converted to ordinals.
If there are two arguments, they must be strings of equal length, and
in the resulting dictionary, each character in x will be mapped to the
character at the same position in y. If there is a third argument, it
must be a string, whose characters will be mapped to None in the result.
[clinic start generated code]*/

static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{ … }

/*[clinic input]
str.translate as unicode_translate

    table: object
        Translation table, which must be a mapping of Unicode ordinals to
        Unicode ordinals, strings, or None.
    /

Replace each character in the string using the given translation table.

The table must implement lookup/indexing via __getitem__, for instance a
dictionary or list.  If this operation raises LookupError, the character is
left untouched.  Characters mapped to None are deleted.
[clinic start generated code]*/

static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{ … }

/*[clinic input]
str.upper as unicode_upper

Return a copy of the string converted to uppercase.
[clinic start generated code]*/

static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{ … }

/*[clinic input]
str.zfill as unicode_zfill

    width: Py_ssize_t
    /

Pad a numeric string with zeros on the left, to fill a field of the given width.

The string is never truncated.
[clinic start generated code]*/

static PyObject *
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
{ … }

/*[clinic input]
@text_signature "($self, prefix[, start[, end]], /)"
str.startswith as unicode_startswith

    prefix as subobj: object
        A string or a tuple of strings to try.
    start: slice_index(accept={int, NoneType}, c_default='0') = None
        Optional start position. Default: start of the string.
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
        Optional stop position. Default: end of the string.
    /

Return True if the string starts with the specified prefix, False otherwise.
[clinic start generated code]*/

static PyObject *
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
                        Py_ssize_t end)
/*[clinic end generated code: output=4bd7cfd0803051d4 input=5f918b5f5f89d856]*/
{ … }


/*[clinic input]
@text_signature "($self, suffix[, start[, end]], /)"
str.endswith as unicode_endswith

    suffix as subobj: object
        A string or a tuple of strings to try.
    start: slice_index(accept={int, NoneType}, c_default='0') = None
        Optional start position. Default: start of the string.
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
        Optional stop position. Default: end of the string.
    /

Return True if the string ends with the specified suffix, False otherwise.
[clinic start generated code]*/

static PyObject *
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
                      Py_ssize_t end)
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=00fbdc774a7d4d71]*/
{ … }


static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{ … }


void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{ … }


PyUnicodeWriter*
PyUnicodeWriter_Create(Py_ssize_t length)
{ … }


void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{ … }


// Initialize _PyUnicodeWriter with initial buffer
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{ … }


int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{ … }

int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind)
{ … }

static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
{ … }

int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{ … }

int
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
{ … }

int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{ … }

int
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
{ … }


int
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
{ … }


int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
                                Py_ssize_t start, Py_ssize_t end)
{ … }


int
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
                               Py_ssize_t start, Py_ssize_t end)
{ … }


int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
                                  const char *ascii, Py_ssize_t len)
{ … }

int
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
                          const char *str,
                          Py_ssize_t size)
{ … }


int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{ … }


int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)
{ … }

PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{ … }


PyObject*
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
{ … }


void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{ … }

#include "stringlib/unicode_format.h"

PyDoc_STRVAR(format__doc__,
             "format($self, /, *args, **kwargs)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "format_map($self, mapping, /)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");

/*[clinic input]
str.__format__ as unicode___format__

    format_spec: unicode
    /

Return a formatted version of the string as described by format_spec.
[clinic start generated code]*/

static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{ … }

/*[clinic input]
str.__sizeof__ as unicode_sizeof

Return the size of the string in memory, in bytes.
[clinic start generated code]*/

static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{ … }

static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{ … }

static PyMethodDef unicode_methods[] = …;

static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{ … }

static PyNumberMethods unicode_as_number = …;

static PySequenceMethods unicode_as_sequence = …;

static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{ … }

static PyMappingMethods unicode_as_mapping = …;


/* Helpers for PyUnicode_Format() */

struct unicode_formatter_t { … };

struct unicode_format_arg_t { … };

static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
{ … }

/* Returns a new reference to a PyUnicode object, or NULL on failure. */

/* Format a float into the writer if the writer is not NULL, or into *p_output
   otherwise.

   Return 0 on success, raise an exception and return -1 on error. */
static int
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
            PyObject **p_output,
            _PyUnicodeWriter *writer)
{ … }

/* formatlong() emulates the format codes d, u, o, x and X, and
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
 * Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X")? digit+
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
 *         set in flags.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted
 * flags        bitmask of format flags; only F_ALT is looked at
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [duoxX]; u acts the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{ … }

/* Format an integer or a float as an integer.
 * Return 1 if the number has been formatted into the writer,
 *        0 if the number has been formatted into *p_output
 *       -1 and raise an exception on error */
static int
mainformatlong(PyObject *v,
               struct unicode_format_arg_t *arg,
               PyObject **p_output,
               _PyUnicodeWriter *writer)
{ … }

static Py_UCS4
formatchar(PyObject *v)
{ … }

/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   Return 0 if the argument has been formatted into arg->str.
   Return 1 if the argument has been written into ctx->writer,
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{ … }

/* Format one argument. Supported conversion specifiers:

   - "s", "r", "a": any type
   - "i", "d", "u": int or float
   - "o", "x", "X": int
   - "e", "E", "f", "F", "g", "G": float
   - "c": int or str (1 character)

   When possible, the output is written directly into the Unicode writer
   (ctx->writer). A string is created when padding is required.

   Return 0 if the argument has been formatted into *p_str,
          1 if the argument has been written into ctx->writer,
         -1 on error. */
static int
unicode_format_arg_format(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject **p_str)
{ … }

static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{ … }

/* Helper of PyUnicode_Format(): format one arg.
   Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg(struct unicode_formatter_t *ctx)
{ … }

PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{ … }

static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);

/*[clinic input]
@classmethod
str.__new__ as unicode_new

    object as x: object = NULL
    encoding: str = NULL
    errors: str = NULL

[clinic start generated code]*/

static PyObject *
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
                 const char *errors)
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
{ … }

static const char *
arg_as_utf8(PyObject *obj, const char *name)
{ … }

static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{ … }

static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
{ … }

void
_PyUnicode_ExactDealloc(PyObject *op)
{ … }

PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to 'utf-8'.\n\
errors defaults to 'strict'.");

static PyObject *unicode_iter(PyObject *seq);

PyTypeObject PyUnicode_Type = …;

/* Initialize the Unicode implementation */

static void
_init_global_state(void)
{ … }

void
_PyUnicode_InitState(PyInterpreterState *interp)
{ … }


PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
{ … }


PyStatus
_PyUnicode_InitTypes(PyInterpreterState *interp)
{ … }

static /* non-null */ PyObject*
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
{ … }

void
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
{ … }

static void
immortalize_interned(PyObject *s)
{ … }

static /* non-null */ PyObject*
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
              bool immortalize)
{ … }

void
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
{ … }

void
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
{ … }


void
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
{ … }

void
PyUnicode_InternInPlace(PyObject **p)
{ … }

// Public-looking name kept for the stable ABI; user should not call this:
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
void
PyUnicode_InternImmortal(PyObject **p)
{ … }

PyObject *
PyUnicode_InternFromString(const char *cp)
{ … }


void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{ … }


/********************* Unicode Iterator **************************/

unicodeiterobject;

static void
unicodeiter_dealloc(unicodeiterobject *it)
{ … }

static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{ … }

static PyObject *
unicodeiter_next(unicodeiterobject *it)
{ … }

static PyObject *
unicode_ascii_iter_next(unicodeiterobject *it)
{ … }

static PyObject *
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
{ … }

PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

static PyObject *
unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
{ … }

PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");

static PyObject *
unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
{ … }

PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");

static PyMethodDef unicodeiter_methods[] = …;

PyTypeObject PyUnicodeIter_Type = …;

PyTypeObject _PyUnicodeASCIIIter_Type = …;

static PyObject *
unicode_iter(PyObject *seq)
{ … }

static int
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
{ … }


static int
config_get_codec_name(wchar_t **config_encoding)
{ … }


static PyStatus
init_stdio_encoding(PyInterpreterState *interp)
{ … }


static int
init_fs_codec(PyInterpreterState *interp)
{ … }


static PyStatus
init_fs_encoding(PyThreadState *tstate)
{ … }


PyStatus
_PyUnicode_InitEncodings(PyThreadState *tstate)
{ … }


static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
{ … }


#ifdef MS_WINDOWS
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
    if (encoding == NULL || errors == NULL) {
        PyMem_RawFree(encoding);
        PyMem_RawFree(errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = errors;

    return init_fs_codec(interp);
}
#endif


#ifdef Py_DEBUG
static inline int
unicode_is_finalizing(void)
{
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
}
#endif


void
_PyUnicode_FiniTypes(PyInterpreterState *interp)
{ … }


void
_PyUnicode_Fini(PyInterpreterState *interp)
{ … }

/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

static PyMethodDef _string_methods[] = …;

static PyModuleDef_Slot module_slots[] = …;

static struct PyModuleDef _string_module = …;

PyMODINIT_FUNC
PyInit__string(void)
{ … }
cpython/Objects/unicodeobject.c