// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationdatareader.h * * created on: 2013feb07 * created by: Markus W. Scherer */ #ifndef __COLLATIONDATAREADER_H__ #define __COLLATIONDATAREADER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/udata.h" struct UDataMemory; U_NAMESPACE_BEGIN struct CollationTailoring; /** * Collation binary data reader. */ struct U_I18N_API CollationDataReader /* all static */ { … }; /* * Format of collation data (ucadata.icu, binary data in coll/ *.res files). * Format version 5. * * The root collation data is stored in the ucadata.icu file. * Tailorings are stored inside .res resource bundle files, with a complete file header. * * Collation data begins with a standard ICU data file header * (DataHeader, see ucmndata.h and unicode/udata.h). * The UDataInfo.dataVersion field contains the UCA and other version numbers, * see the comments for CollationTailoring.version. * * After the header, the file contains the following parts. * Constants are defined as enum values of the CollationDataReader class. * See also the Collation class. * * int32_t indexes[indexesLength]; * The indexes array has variable length. * Some tailorings only need the length and the options, * others only add reorderCodes and the reorderTable, * some need to store mappings. * Only as many indexes are stored as needed to read all of the data. * * Index 0: indexesLength * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS * Index 2..3: Unused/reserved/0. * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo * are stored in a short, contiguous part of the ce32s array. * * Indexes 5..19 are byte offsets in ascending order. * Each byte offset marks the start of the next part in the data file, * and the end of the previous one. * When two consecutive byte offsets are the same (or too short), * then the corresponding part is empty. * Byte offsets are offsets from after the header, * that is, from the beginning of the indexes[]. * Each part starts at an offset with proper alignment for its data. * If necessary, the previous part may include padding bytes to achieve this alignment. * The last byte offset that is stored in the indexes indicates the total size of the data * (starting with the indexes). * * int32_t reorderCodes[]; -- empty in root * The list of script and reordering codes. * * Beginning with format version 5, this array may optionally * have trailing entries with a full list of reorder ranges * as described for CollationSettings::reorderRanges. * * Script or reorder codes are first and do not exceed 16-bit values. * Range limits are stored in the upper 16 bits, and are never 0. * Split this array into reorder codes and ranges at the first entry * with non-zero upper 16 bits. * * If the ranges are missing but needed for split-reordered primary lead bytes, * then they are regenerated at load time. * * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes * Primary-weight lead byte permutation table. * Normally present when the reorderCodes are, but can be built at load time. * * Beginning with format version 5, a 0 entry at a non-zero index * (which is otherwise an illegal value) * means that the primary lead byte is "split" * (there are different offsets for primaries that share that lead byte) * and the reordering offset must be determined via the reorder ranges * that are either stored as part of the reorderCodes array * or regenerated at load time. * * UTrie2 trie; -- see utrie2_impl.h and utrie2.h * The trie holds the main collation data. Each code point is mapped to a 32-bit value. * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, * in which case it is a special CE32 and contains a 4-bit tag and further data. * See the Collation class for details. * * The trie has a value for each lead surrogate code unit with some bits encoding * collective properties of the 1024 supplementary characters whose UTF-16 form starts with * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. * * int64_t ces[]; * 64-bit CEs and expansions that cannot be stored in a more compact form. * * uint32_t ce32s[]; * CE32s for expansions in compact form, and for characters whose trie values * contain special data. * * uint32_t rootElements[]; -- empty in all tailorings * Compact storage for all of the CEs that occur in the root collation. * See the CollationRootElements class. * * char16_t *contexts[]; * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. * * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() * Serialized form of characters that are unsafe when iterating backwards, * and at the end of an identical string prefix. * Back up to a safe character. * Lead surrogates are "unsafe" when any of their corresponding supplementary * code points are unsafe. * Does not include [:^lccc=0:][:^tccc=0:]. * For each tailoring, the root unsafeBackwardSet is subtracted. * (As a result, in many tailorings no set needs to be stored.) * * uint16_t fastLatinTable[]; * Optional optimization for Latin text. * See the CollationFastLatin class. * * uint16_t scripts[]; -- empty in all tailorings * Format version 5: * uint16_t numScripts; * uint16_t scriptsIndex[numScripts+16]; * uint16_t scriptStarts[]; * See CollationData::numScripts etc. * * Format version 4: * Table of the reordering groups with their first and last lead bytes, * and their script and reordering codes. * See CollationData::scripts. * * UBool compressibleBytes[]; -- empty in all tailorings * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. * * ----------------- * Changes for formatVersion 5 (ICU 55) * * Reordering moves single scripts, not groups of scripts. * Reorder ranges are optionally appended to the reorderCodes, * and a 0 entry in the reorderTable indicates a split lead byte. * The scripts data has a new format. * * The rootElements may contain secondary and tertiary weights below common=05. * (Used for small Hiragana letters.) * Where is occurs, there is also an explicit unit with common secondary & tertiary weights. * There are no other data structure changes, but builder code needs to be able to handle such data. * * The collation element for the merge separator code point U+FFFE * does not necessarily have special, unique secondary/tertiary weights any more. */ U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONDATAREADER_H__