unicodedata.c | Explore in Territory

/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg ([email protected]).
   Modified for Python 2.0 by Fredrik Lundh ([email protected])
   Modified by Martin v. Löwis ([email protected])

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#ifndef Py_BUILD_CORE_BUILTIN
#define Py_BUILD_CORE_MODULE …
#endif

#include "Python.h"
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI

#include <stdbool.h>
#include <stddef.h>               // offsetof()

/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/

/* character properties */

_PyUnicode_DatabaseRecord;

change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{ … }

/* ------------- Previous-version API ------------------------------------- */
PreviousDBVersion;

#include "clinic/unicodedata.c.h"

#define get_old_record(self, v) …

static PyMemberDef DB_members[] = …;

// Check if self is an unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See unicodedata_functions comment to the rationale of this macro.
#define UCD_Check(self) …

static PyObject*
new_previous_version(PyTypeObject *ucd_type,
                     const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{ … }


/* --- Module API --------------------------------------------------------- */

/*[clinic input]
unicodedata.UCD.decimal

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent decimal value.

Returns the decimal value assigned to the character chr as integer.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_decimal_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
{ … }

/*[clinic input]
unicodedata.UCD.digit

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent digit value.

Returns the digit value assigned to the character chr as integer.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
{ … }

/*[clinic input]
unicodedata.UCD.numeric

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Converts a Unicode character into its equivalent numeric value.

Returns the numeric value assigned to the character chr as float.
If no such value is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_numeric_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
{ … }

/*[clinic input]
unicodedata.UCD.category

    self: self
    chr: int(accept={str})
    /

Returns the general category assigned to the character chr as string.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_category_impl(PyObject *self, int chr)
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
{ … }

/*[clinic input]
unicodedata.UCD.bidirectional

    self: self
    chr: int(accept={str})
    /

Returns the bidirectional class assigned to the character chr as string.

If no such value is defined, an empty string is returned.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
{ … }

/*[clinic input]
unicodedata.UCD.combining -> int

    self: self
    chr: int(accept={str})
    /

Returns the canonical combining class assigned to the character chr as integer.

Returns 0 if no combining class is defined.
[clinic start generated code]*/

static int
unicodedata_UCD_combining_impl(PyObject *self, int chr)
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
{ … }

/*[clinic input]
unicodedata.UCD.mirrored -> int

    self: self
    chr: int(accept={str})
    /

Returns the mirrored property assigned to the character chr as integer.

Returns 1 if the character has been identified as a "mirrored"
character in bidirectional text, 0 otherwise.
[clinic start generated code]*/

static int
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
{ … }

/*[clinic input]
unicodedata.UCD.east_asian_width

    self: self
    chr: int(accept={str})
    /

Returns the east asian width assigned to the character chr as string.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
{ … }

/*[clinic input]
unicodedata.UCD.decomposition

    self: self
    chr: int(accept={str})
    /

Returns the character decomposition mapping assigned to the character chr as string.

An empty string is returned in case no such mapping is defined.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{ … }

static void
get_decomp_record(PyObject *self, Py_UCS4 code,
                  int *index, int *prefix, int *count)
{ … }

#define SBase …
#define LBase …
#define VBase …
#define TBase …
#define LCount …
#define VCount …
#define TCount …
#define NCount …
#define SCount …

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{ … }

static int
find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
{ … }

static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{ … }

// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
QuickcheckResult;

/* Run the Unicode normalization "quickcheck" algorithm.
 *
 * Return YES or NO if quickcheck determines the input is certainly
 * normalized or certainly not, and MAYBE if quickcheck is unable to
 * tell.
 *
 * If `yes_only` is true, then return MAYBE as soon as we determine
 * the answer is not YES.
 *
 * For background and details on the algorithm, see UAX #15:
 *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
 */
static QuickcheckResult
is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
                         bool yes_only)
{ … }

/*[clinic input]
unicodedata.UCD.is_normalized

    self: self
    form: unicode
    unistr as input: unicode
    /

Return whether the Unicode string unistr is in the normal form 'form'.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
                                   PyObject *input)
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
{ … }


/*[clinic input]
unicodedata.UCD.normalize

    self: self
    form: unicode
    unistr as input: unicode
    /

Return the normal form 'form' for the Unicode string unistr.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
                               PyObject *input)
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
{ … }

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static const char * const hangul_syllables[][3] = …;

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{ … }

/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) …
#define IS_NAMED_SEQ(cp) …


// DAWG decoding functions

static unsigned int
_dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
{ … }

static int
_dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
                 unsigned int label_offset, unsigned int namepos)
{ … }

// reading DAWG node information:
// a node is encoded by a varint. The lowest bit of that int is set if the node
// is a final, accepting state. The higher bits of that int represent the
// number of names that are encoded by the sub-DAWG started by this node. It's
// used to compute the position of a name.
//
// the starting node of the DAWG is at position 0.
//
// the varint representing a node is followed by the node's edges, the encoding
// is described below


static unsigned int
_dawg_decode_node(unsigned int node_offset, bool* final)
{ … }

static bool
_dawg_node_is_final(unsigned int node_offset)
{ … }

static unsigned int
_dawg_node_descendant_count(unsigned int node_offset)
{ … }


// reading DAWG edge information:
// a DAWG edge is comprised of the following information:
// (1) the size of the label of the string attached to the edge
// (2) the characters of that edge
// (3) the target node
// (4) whether the edge is the last edge in the list of edges following a node
//
// this information is encoded in a compact form as follows:
//
// +---------+-----------------+--------------+--------------------
// |  varint | size (if != 1)  | label chars  | ... next edge ...
// +---------+-----------------+--------------+--------------------
//
// - first comes a varint
//     - the lowest bit of that varint is whether the edge is final (4)
//     - the second lowest bit of that varint is true if the size of
//       the length of the label is 1 (1)
//     - the rest of the varint is an offset that can be used to compute
//       the offset of the target node of that edge (3)
//  - if the size is not 1, the first varint is followed by a
//    character encoding the number of characters of the label (1)
//    (unicode character names aren't larger than 256 bytes, therefore each
//    edge label can be at most 256 chars, but is usually smaller)
//  - the next size bytes are the characters of the label (2)
//
// the offset of the target node is computed as follows: the number in the
// upper bits of the varint needs to be added to the offset of the target node
// of the previous edge. For the first edge, where there is no previous target
// node, the offset of the first edge is used.
// The intuition here is that edges going out from a node often lead to nodes
// that are close by, leading to small offsets from the current node and thus
// fewer bytes.
//
// There is a special case: if a final node has no outgoing edges, it has to be
// followed by a 0 byte to indicate that there are no edges (because the end of
// the edge list is normally indicated in a bit in the edge encoding). This is
// indicated by _dawg_decode_edge returning -1


static int
_dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
                  unsigned int edge_offset, unsigned int* size,
                  unsigned int* label_offset, unsigned int* target_node_offset)
{ … }

static int
_lookup_dawg_packed(const char* name, unsigned int namelen)
{ … }

static int
_inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
{ … }


static int
_getucname(PyObject *self,
           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
{ … }

static int
capi_getucname(Py_UCS4 code,
               char* buffer, int buflen,
               int with_alias_and_seq)
{ … }

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{ … }

static int
_check_alias_and_seq(Py_UCS4* code, int with_named_seq)
{ … }


static int
_getcode(const char* name, int namelen, Py_UCS4* code)
{ … }


static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
             int with_named_seq)
{ … }

static void
unicodedata_destroy_capi(PyObject *capsule)
{ … }

static PyObject *
unicodedata_create_capi(void)
{
    _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
    if (capi == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    capi->getname = capi_getucname;
    capi->getcode = capi_getcode;

    PyObject *capsule = PyCapsule_New(capi,
                                      PyUnicodeData_CAPSULE_NAME,
                                      unicodedata_destroy_capi);
    if (capsule == NULL) {
        PyMem_Free(capi);
    }
    return capsule;
};


/* -------------------------------------------------------------------- */
/* Python bindings */

/*[clinic input]
unicodedata.UCD.name

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Returns the name assigned to the character chr as a string.

If no name is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
{ … }

/*[clinic input]
unicodedata.UCD.lookup

    self: self
    name: str(accept={str, robuffer}, zeroes=True)
    /

Look up character by name.

If a character with the given name is found, return the
corresponding character.  If not found, KeyError is raised.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
                            Py_ssize_t name_length)
/*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
{ … }

// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is an UCD instance. The UCD_Check() macro is used to check if self is
// an UCD instance.
static PyMethodDef unicodedata_functions[] = …;

static int
ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
{ … }

static void
ucd_dealloc(PreviousDBVersion *self)
{ … }

static PyType_Slot ucd_type_slots[] = …;

static PyType_Spec ucd_type_spec = …;

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");

static int
unicodedata_exec(PyObject *module)
{ … }

static PyModuleDef_Slot unicodedata_slots[] = …;

static struct PyModuleDef unicodedata_module = …;

PyMODINIT_FUNC
PyInit_unicodedata(void)
{ … }


/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
cpython/Modules/unicodedata.c