// chromium/third_party/icu/source/i18n/collationfastlatin.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationfastlatin.h
*
* created on: 2013aug09
* created by: Markus W. Scherer
*/

#ifndef __COLLATIONFASTLATIN_H__
#define __COLLATIONFASTLATIN_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

U_NAMESPACE_BEGIN

// Forward declarations: these collation types are only named here;
// their definitions are not part of this header.
struct CollationData;
struct CollationSettings;

/**
 * Class for the Latin-text collation fastpath.
 * Annotated as "all static"; no members are declared in this view
 * of the header, so the body is empty.
 */
class U_I18N_API CollationFastLatin {
};

/*
 * Format of the CollationFastLatin data table.
 * CollationFastLatin::VERSION = 2.
 *
 * This table contains data for a Latin-text collation fastpath.
 * The data is stored as an array of uint16_t which contains the following parts.
 *
 * uint16_t  -- version & header length
 *   Bits 15..8: version, must match the VERSION
 *         7..0: length of the header
 *
 * uint16_t varTops[header length - 1]
 *   Version 2:
 *   varTops[m] is the highest CollationFastLatin long-primary weight
 *   of supported maxVariable group m
 *   (special reorder group space, punct, symbol, currency).
 *
 *   Version 1:
 *   Each of these values maps the variable top lead byte of a supported maxVariable group
 *   to the highest CollationFastLatin long-primary weight.
 *   The values are stored in ascending order.
 *   Bits 15..7: max fast-Latin long-primary weight (bits 11..3 shifted left by 4 bits)
 *         6..0: regular primary lead byte
 *
 * uint16_t miniCEs[0x1c0]
 *   A mini collation element for each character U+0000..U+017F and U+2000..U+203F.
 *   Each value encodes one or two mini CEs (two are possible if the first one
 *   has a short mini primary and the second one is a secondary CE, i.e., primary == 0),
 *   or points to an expansion or to a contraction table.
 *   U+0000 always has a contraction entry,
 *   so that NUL-termination need not be tested in the fastpath.
 *   If the collation elements for a character or contraction cannot be encoded in this format,
 *   then the BAIL_OUT value is stored.
 *   For details see the comments for the class constants.
 *
 * uint16_t expansions[variable length];
 *   Expansion mini CEs contain an offset relative to just after the miniCEs table.
 *   An expansion contains exactly 2 mini CEs.
 *
 * uint16_t contractions[variable length];
 *   Contraction mini CEs contain an offset relative to just after the miniCEs table.
 *   It points to a list of tuples which map from a contraction suffix character to a result.
 *   First uint16_t of each tuple:
 *     Bits 10..9: Length of the result (1..3), see comments on CONTR_LENGTH_SHIFT.
 *     Bits  8..0: Contraction character, see comments on CONTR_CHAR_MASK.
 *   This is followed by 0, 1, or 2 uint16_t according to the length.
 *   Each list is terminated by an entry with CONTR_CHAR_MASK.
 *   Each list starts with such an entry which also contains the default result
 *   for when there is no contraction match.
 *
 * -----------------
 * Changes for version 2 (ICU 55)
 *
 * Special reorder groups do not necessarily start on whole primary lead bytes any more.
 * Therefore, the varTops data has a new format:
 * Version 1 stored the lead bytes of the highest root primaries for
 * the maxVariable-supported special reorder groups.
 * Now the top 16 bits would need to be stored,
 * and it is simpler to store only the fast-Latin weights.
 */

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONFASTLATIN_H__