// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2014 SGI.
 * All rights reserved.
 */

#include "utf8n.h"

int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
{ … }

/*
 * UTF-8 valid ranges.
 *
 * The UTF-8 encoding spreads the bits of a 32bit word over several
 * bytes. This table gives the ranges that can be held and how they'd
 * be represented.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * There is an additional requirement on UTF-8, in that only the
 * shortest representation of a 32bit value is to be used.  A decoder
 * must not decode sequences that do not satisfy this requirement.
 * Thus the allowed ranges have a lower bound.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 * 17 planes of 65536 values.  This limits the sequences actually seen
 * even more, to just the following.
 *
 *          0 -     0x7F: 0                   -     0x7F
 *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
 *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
 *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
 *
 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
 *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode never larger than UTF-32.
 *
 * The shortest sequence requirement was introduced by:
 *    Corrigendum #1: UTF-8 Shortest Form
 * It can be found here:
 *    http://www.unicode.org/versions/corrigendum1.html
 *
 * An illustrative check written directly from this table appears
 * after the utf8trie_t description below.
 */

/*
 * Return the number of bytes used by the current UTF-8 sequence.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence.
 */
static inline int utf8clen(const char *s)
{ … }

/*
 * Decode a 3-byte UTF-8 sequence.
 */
static unsigned int utf8decode3(const char *str)
{ … }

/*
 * Encode a 3-byte UTF-8 sequence.
 */
static int utf8encode3(char *str, unsigned int val)
{ … }

/*
 * utf8trie_t
 *
 * A compact binary tree, used to decode UTF-8 characters.
 *
 * Internal nodes are one byte for the node itself, and up to three
 * bytes for an offset into the tree.  The first byte contains the
 * following information:
 *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
 *  OFFLEN    - 2 bit field - number of bytes in the offset
 * if offlen == 0 (non-branching node)
 *  RIGHTPATH - 1 bit field - set if the following node is for the
 *                            right-hand path (tested bit is set)
 *  TRIENODE  - 1 bit field - set if the following node is an internal
 *                            node, otherwise it is a leaf node
 * if offlen != 0 (branching node)
 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 */
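/*
 * Illustrative sketch, not part of the original file: a direct
 * restatement of the "UTF-8 valid ranges" table above.  It checks a
 * single sequence for well-formedness, shortest form, the 0x10FFFF
 * upper bound and the surrogate gap.  The name utf8_seq_valid_example
 * is hypothetical; the kernel itself validates input as a side effect
 * of the trie lookup further down.  Returns 1 if the sequence at s is
 * valid UTF-8 unicode, 0 otherwise.
 */
static int utf8_seq_valid_example(const unsigned char *s, size_t len)
{
        unsigned int c, min, nbytes, i;

        if (len == 0)
                return 0;
        if (s[0] < 0x80) {                      /* 0xxxxxxx */
                return 1;
        } else if ((s[0] & 0xE0) == 0xC0) {     /* 110xxxxx */
                nbytes = 2; c = s[0] & 0x1F; min = 0x80;
        } else if ((s[0] & 0xF0) == 0xE0) {     /* 1110xxxx */
                nbytes = 3; c = s[0] & 0x0F; min = 0x800;
        } else if ((s[0] & 0xF8) == 0xF0) {     /* 11110xxx */
                nbytes = 4; c = s[0] & 0x07; min = 0x10000;
        } else {
                return 0;       /* 5- and 6-byte forms never occur */
        }
        if (len < nbytes)
                return 0;
        for (i = 1; i < nbytes; i++) {          /* 10xxxxxx continuations */
                if ((s[i] & 0xC0) != 0x80)
                        return 0;
                c = (c << 6) | (s[i] & 0x3F);
        }
        if (c < min)                            /* shortest form only */
                return 0;
        if (c > 0x10FFFF)                       /* beyond the 17 planes */
                return 0;
        if (c >= 0xD800 && c <= 0xDFFF)         /* surrogates */
                return 0;
        return 1;
}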
utf8trie_t;

#define BITNUM          …
#define NEXTBYTE        …
#define OFFLEN          …
#define OFFLEN_SHIFT    …
#define RIGHTPATH       …
#define TRIENODE        …
#define RIGHTNODE       …
#define LEFTNODE        …

/*
 * utf8leaf_t
 *
 * The leaves of the trie are embedded in the trie, and so the same
 * underlying datatype: unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into ->utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class.  During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition.  If leaf[1] == 255, then leaf[2] is the
 *          start of a NUL-terminated string that is the decomposition
 *          of the character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded, with the
 * exception of Hangul syllables, which are decomposed algorithmically.
 *
 * Casefolding, if applicable, is also done using decompositions.
 *
 * The trie is constructed in such a way that leaves exist for all
 * UTF-8 sequences that match the criteria from the "UTF-8 valid
 * ranges" comment above, and only for those sequences.  Therefore a
 * lookup in the trie can be used to validate the UTF-8 input.
 */
utf8leaf_t;

#define LEAF_GEN(LEAF)  …
#define LEAF_CCC(LEAF)  …
#define LEAF_STR(LEAF)  …

#define MINCCC          …
#define MAXCCC          …
#define STOPPER         …
#define DECOMPOSE       …

/* Marker for hangul syllable decomposition. */
#define HANGUL          …
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF  …

/*
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 *
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 *
 * SBase = 0xAC00
 * LBase = 0x1100
 * VBase = 0x1161
 * TBase = 0x11A7
 * LCount = 19
 * VCount = 21
 * TCount = 28
 * NCount = 588 (VCount * TCount)
 * SCount = 11172 (LCount * NCount)
 *
 * Decomposition:
 *   SIndex = s - SBase
 *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
 *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 *
 * A worked example of the full decomposition appears after
 * utf8hangul() below.
 */

/* Constants */
#define SB      …
#define LB      …
#define VB      …
#define TB      …
#define LC      …
#define VC      …
#define TC      …
#define NC      …
#define SC      …

/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
{ … }
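/*
 * Illustrative sketch, not part of the original file: the "LVT (Full)"
 * arithmetic from the comment above, written out with the published
 * constants (the kernel's own SB/LB/VB/TB/... values are elided here).
 * The name hangul_full_decompose_example is hypothetical.  Given a
 * syllable in the range 0xAC00 - 0xD7A3, it stores the decomposition
 * in d[] and returns the number of code points (2 or 3).
 *
 * For example, s = 0xD55C (HAN) gives SIndex = 10588, LIndex = 18,
 * VIndex = 0, TIndex = 4, so d = <0x1112, 0x1161, 0x11AB>.
 */
static int hangul_full_decompose_example(unsigned int s, unsigned int d[3])
{
        const unsigned int SBase = 0xAC00, LBase = 0x1100;
        const unsigned int VBase = 0x1161, TBase = 0x11A7;
        const unsigned int TCount = 28, NCount = 588, SCount = 11172;
        unsigned int si, li, vi, ti;

        if (s < SBase || s >= SBase + SCount)
                return 0;               /* not a Hangul syllable */
        si = s - SBase;
        li = si / NCount;
        vi = (si % NCount) / TCount;
        ti = si % TCount;
        d[0] = LBase + li;              /* LPart */
        d[1] = VBase + vi;              /* VPart */
        if (ti == 0)
                return 2;               /* d = <LPart, VPart> */
        d[2] = TBase + ti;              /* TPart */
        return 3;                       /* d = <LPart, VPart, TPart> */
}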
/*
 * Use trie to scan s, touching at most len bytes.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 * is well-formed and corresponds to a known unicode code point.  The
 * shorthand for this will be "is valid UTF-8 unicode".
 */
static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
                enum utf8_normalization n, unsigned char *hangul,
                const char *s, size_t len)
{ … }

/*
 * Use trie to scan s.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * Forwards to utf8nlookup().
 */
static utf8leaf_t *utf8lookup(const struct unicode_map *um,
                enum utf8_normalization n, unsigned char *hangul, const char *s)
{ … }

/*
 * Length of the normalization of s, touching at most len bytes.
 * Return -1 if s is not valid UTF-8 unicode.
 */
ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
                const char *s, size_t len)
{ … }

/*
 * Set up a utf8cursor for use by utf8byte().
 *
 *   u8c : pointer to cursor.
 *   um  : unicode map to use for normalization.
 *   n   : normalization form.
 *   s   : string.
 *   len : length of s.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
                enum utf8_normalization n, const char *s, size_t len)
{ … }

/*
 * Get one byte from the normalized form of the string described by u8c.
 *
 * Returns the byte cast to an unsigned char on success, and -1 on failure.
 *
 * The cursor keeps track of the location in the string in u8c->s.
 * When a character is decomposed, the current location is stored in
 * u8c->p, and u8c->s is set to the start of the decomposition.  Note
 * that bytes from a decomposition do not count against u8c->len.
 *
 * Characters are emitted if they match the current CCC in u8c->ccc.
 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 * and the function returns 0 in that case.
 *
 * Sorting by CCC is done by repeatedly scanning the string.  The
 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 * the start of the scan.  The first pass finds the lowest CCC to be
 * emitted and stores it in u8c->nccc, the second pass emits the
 * characters with this CCC and finds the next lowest CCC.  This limits
 * the number of passes to 1 + the number of different CCCs in the
 * sequence being scanned.
 *
 * Therefore:
 *  u8c->p   != NULL -> a decomposition is being scanned.
 *  u8c->ss  != NULL -> this is a repeating scan.
 *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 *
 * A sketch of a typical caller loop appears at the end of this file.
 */
int utf8byte(struct utf8cursor *u8c)
{ … }

#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
EXPORT_SYMBOL_GPL(utf8version_is_supported);
EXPORT_SYMBOL_GPL(utf8nlen);
EXPORT_SYMBOL_GPL(utf8ncursor);
EXPORT_SYMBOL_GPL(utf8byte);
#endif
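/*
 * Illustrative sketch, not part of the original file: one way a caller
 * could drive the cursor API above, based purely on the contracts
 * documented for utf8ncursor() (0 on success, -1 on error) and
 * utf8byte() (next normalized byte, 0 at end of string, -1 on failure).
 * The name normalize_into_example, the destination-buffer convention
 * and the assumption that a struct utf8cursor can live on the stack
 * are all assumptions, not kernel API.  Returns the number of bytes
 * written, or -1 if s is not valid UTF-8 unicode or dst is too small.
 */
static ssize_t normalize_into_example(const struct unicode_map *um,
                enum utf8_normalization n, const char *s, size_t len,
                char *dst, size_t dstlen)
{
        struct utf8cursor u8c;
        ssize_t nwritten = 0;
        int c;

        if (utf8ncursor(&u8c, um, n, s, len) < 0)
                return -1;

        while ((c = utf8byte(&u8c)) > 0) {
                if ((size_t)nwritten >= dstlen)
                        return -1;      /* destination too small */
                dst[nwritten++] = (char)c;
        }
        return c < 0 ? -1 : nwritten;   /* -1: invalid UTF-8 input */
}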