// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: utrie2_builder.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2008sep26 (split off from utrie2.c) * created by: Markus W. Scherer * * This is a common implementation of a Unicode trie. * It is a kind of compressed, serializable table of 16- or 32-bit values associated with * Unicode code points (0..0x10ffff). * This is the second common version of a Unicode trie (hence the name UTrie2). * See utrie2.h for a comparison. * * This file contains only the builder code. * See utrie2.c for the runtime and enumeration code. */ // #define UTRIE2_DEBUG #ifdef UTRIE2_DEBUG # include <stdio.h> #endif // #define UCPTRIE_DEBUG #include "unicode/utypes.h" #ifdef UCPTRIE_DEBUG #include "unicode/ucptrie.h" #include "unicode/umutablecptrie.h" #include "ucptrie_impl.h" #endif #include "cmemory.h" #include "utrie2.h" #include "utrie2_impl.h" #include "utrie.h" // for utrie2_fromUTrie() /* Implementation notes ----------------------------------------------------- */ /* * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values * have been chosen to minimize trie sizes overall. * Most of the code is flexible enough to work with a range of values, * within certain limits. * * Exception: Support for separate values for lead surrogate code _units_ * vs. code _points_ was added after the constants were fixed, * and has not been tested nor particularly designed for different constant values. * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 * part and back.) * * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction * remapping) stops working. * * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() * assumes that a single index-2 block is used for 0x400 code points * corresponding to one lead surrogate. * * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains * more than one Unicode plane, and the split of the index-2 table into a BMP * part and a supplementary part, with a gap in between, would not work. * * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because * there is data with more than 64k distinct values, * for example for Unihan collation with a separate collation weight per * Han character. */ /* Building a trie ----------------------------------------------------------*/ enum { … }; /* Start with allocation of 16k data entries. */ #define UNEWTRIE2_INITIAL_DATA_LENGTH … /* Grow about 8x each time. */ #define UNEWTRIE2_MEDIUM_DATA_LENGTH … static int32_t allocIndex2Block(UNewTrie2 *trie); U_CAPI UTrie2 * U_EXPORT2 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { … } static UNewTrie2 * cloneBuilder(const UNewTrie2 *other) { … } U_CAPI UTrie2 * U_EXPORT2 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { … } NewTrieAndStatus; static UBool U_CALLCONV copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { … } #ifdef UTRIE2_DEBUG static long countInitial(const UTrie2 *trie) { uint32_t initialValue=trie->initialValue; int32_t length=trie->dataLength; long count=0; if(trie->data16!=nullptr) { for(int32_t i=0; i<length; ++i) { if(trie->data16[i]==initialValue) { ++count; } } } else { for(int32_t i=0; i<length; ++i) { if(trie->data32[i]==initialValue) { ++count; } } } return count; } static void utrie_printLengths(const UTrie *trie) { long indexLength=trie->indexLength; long dataLength=(long)trie->dataLength; long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2); printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", indexLength, dataLength, totalLength); } static void utrie2_printLengths(const UTrie2 *trie, const char *which) { long indexLength=trie->indexLength; long dataLength=(long)trie->dataLength; long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2); printf("**UTrie2Lengths(%s %s)** index:%6ld data:%6ld countInitial:%6ld serialized:%6ld\n", which, trie->name, indexLength, dataLength, countInitial(trie), totalLength); } #endif U_CAPI UTrie2 * U_EXPORT2 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { … } /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ U_CAPI UTrie2 * U_EXPORT2 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { … } static inline UBool isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { … } static int32_t allocIndex2Block(UNewTrie2 *trie) { … } static int32_t getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { … } static int32_t allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { … } /* call when the block's reference counter reaches 0 */ static void releaseDataBlock(UNewTrie2 *trie, int32_t block) { … } static inline UBool isWritableBlock(UNewTrie2 *trie, int32_t block) { … } static inline void setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { … } /** * No error checking for illegal arguments. * * @return -1 if no new data block available (out of memory in data array) * @internal */ static int32_t getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { … } /** * @return true if the value was successfully set */ static void set32(UNewTrie2 *trie, UChar32 c, UBool forLSCP, uint32_t value, UErrorCode *pErrorCode) { … } U_CAPI void U_EXPORT2 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { … } U_CAPI void U_EXPORT2 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { … } static void writeBlock(uint32_t *block, uint32_t value) { … } /** * initialValue is ignored if overwrite=true * @internal */ static void fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value, uint32_t initialValue, UBool overwrite) { … } U_CAPI void U_EXPORT2 utrie2_setRange32(UTrie2 *trie, UChar32 start, UChar32 end, uint32_t value, UBool overwrite, UErrorCode *pErrorCode) { … } /* compaction --------------------------------------------------------------- */ static inline UBool equal_int32(const int32_t *s, const int32_t *t, int32_t length) { … } static inline UBool equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { … } static int32_t findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { … } static int32_t findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { … } /* * Find the start of the last range in the trie by enumerating backward. * Indexes for supplementary code points higher than this will be omitted. */ static UChar32 findHighStart(UNewTrie2 *trie, uint32_t highValue) { … } /* * Compact a build-time trie. * * The compaction * - removes blocks that are identical with earlier ones * - overlaps adjacent blocks as much as possible (if overlap==true) * - moves blocks in steps of the data granularity * - moves and overlaps blocks that overlap with multiple values in the overlap region * * It does not * - try to move and overlap blocks that are not already adjacent */ static void compactData(UNewTrie2 *trie) { … } static void compactIndex2(UNewTrie2 *trie) { … } static void compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { … } /* serialization ------------------------------------------------------------ */ /** * Maximum length of the runtime index array. * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. * (The actual maximum length is lower, * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) */ #define UTRIE2_MAX_INDEX_LENGTH … /** * Maximum length of the runtime data array. * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, * and by uint16_t UTrie2Header.shiftedDataLength. */ #define UTRIE2_MAX_DATA_LENGTH … /* Compact and internally serialize the trie. */ U_CAPI void U_EXPORT2 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { … }