// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************** * COPYRIGHT: * Copyright (c) 1996-2015, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** */ #ifndef NORMLZR_H #define NORMLZR_H #include "unicode/utypes.h" #if U_SHOW_CPLUSPLUS_API /** * \file * \brief C++ API: Unicode Normalization */ #if !UCONFIG_NO_NORMALIZATION #include "unicode/chariter.h" #include "unicode/normalizer2.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "unicode/uobject.h" U_NAMESPACE_BEGIN /** * Old Unicode normalization API. * * This API has been replaced by the Normalizer2 class and is only available * for backward compatibility. This class simply delegates to the Normalizer2 class. * There is one exception: The new API does not provide a replacement for Normalizer::compare(). * * The Normalizer class supports the standard normalization forms described in * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> * Unicode Standard Annex #15: Unicode Normalization Forms</a>. * * The Normalizer class consists of two parts: * - static functions that normalize strings or test if strings are normalized * - a Normalizer object is an iterator that takes any kind of text and * provides iteration over its normalized form * * The Normalizer class is not suitable for subclassing. * * For basic information about normalization forms and details about the C API * please see the documentation in unorm.h. * * The iterator API with the Normalizer constructors and the non-static functions * use a CharacterIterator as input. It is possible to pass a string which * is then internally wrapped in a CharacterIterator. * The input text is not normalized all at once, but incrementally where needed * (providing efficient random access). 
 * This allows passing in a large text while spending only a small amount of time * normalizing a small part of that text. * However, if the entire text is normalized, then the iterator will be * slower than normalizing the entire text at once and iterating over the result. * A possible use of the Normalizer iterator is also to report an index into the * original text that is close to where the normalized characters come from. * * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0. * The earlier implementation reported the getIndex() inconsistently, * and previous() could not be used after setIndex(), next(), first(), and current(). * * Normalizer allows normalization to start from anywhere in the input text by * calling setIndexOnly(), first(), or last(). * Without calling any of these, the iterator will start at the beginning of the text. * * At any time, next() returns the next normalized code point (UChar32), * with post-increment semantics (like CharacterIterator::next32PostInc()). * previous() returns the previous normalized code point (UChar32), * with pre-decrement semantics (like CharacterIterator::previous32()). * * current() returns the current code point * (or, after the index was newly set, the one at that index) without moving * the getIndex(). Note that if the text at the current position * needs to be normalized, then current() will do that. * (This is why current() is not const.) * It is more efficient to call setIndexOnly() instead, which does not * normalize. * * getIndex() always refers to the position in the input text where the normalized * code points are returned from. It does not always change with each returned * code point. * The code point that is returned from any of the functions * corresponds to text at or after getIndex(), according to the * function's iteration semantics (post-increment or pre-decrement). * * next() returns a code point from at or after the getIndex() * from before the next() call. 
 After the next() call, the getIndex() * might have moved to where the next code point will be returned from * (from a next() or current() call). * This is semantically equivalent to array access with array[index++] * (post-increment semantics). * * previous() returns a code point from at or after the getIndex() * from after the previous() call. * This is semantically equivalent to array access with array[--index] * (pre-decrement semantics). * * Internally, the Normalizer iterator normalizes a small piece of text * starting at the getIndex() and ending at a following "safe" index. * The normalized result is stored in an internal string buffer, and * the code points are iterated from there. * With multiple iteration calls, this is repeated until the next piece * of text needs to be normalized, and the getIndex() needs to be moved. * * The following "safe" index, the internal buffer, and the secondary * iteration index into that buffer are not exposed on the API. * This also means that it is currently not practical to return to * a particular, arbitrary position in the text because one would need to * know, and be able to set, in addition to the getIndex(), at least the * current index into the internal buffer as well. * It is currently only possible to observe when getIndex() changes * (with careful consideration of the iteration semantics), * at which time the internal index will be 0. * For example, if getIndex() is different after next() than before it, * then the internal index is 0 and one can return to this getIndex() * later with setIndexOnly(). * * Note: While the setIndex() and getIndex() refer to indices in the * underlying Unicode input text, the next() and previous() methods * iterate through characters in the normalized output. * This means that there is not necessarily a one-to-one correspondence * between characters returned by next() and previous() and the indices * passed to and returned from setIndex() and getIndex(). 
* It is for this reason that Normalizer does not implement the CharacterIterator interface. * * @author Laura Werner, Mark Davis, Markus Scherer * @stable ICU 2.0 */ class U_COMMON_API Normalizer : public UObject { … }; //------------------------------------------------------------------------- // Inline implementations //------------------------------------------------------------------------- #ifndef U_HIDE_DEPRECATED_API inline bool Normalizer::operator!= (const Normalizer& other) const { … } inline UNormalizationCheckResult Normalizer::quickCheck(const UnicodeString& source, UNormalizationMode mode, UErrorCode &status) { … } inline UBool Normalizer::isNormalized(const UnicodeString& source, UNormalizationMode mode, UErrorCode &status) { … } #endif /* U_HIDE_DEPRECATED_API */ inline int32_t Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2, uint32_t options, UErrorCode &errorCode) { … } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_NORMALIZATION */ #endif // NORMLZR_H #endif /* U_SHOW_CPLUSPLUS_API */