normlzr.h | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************
 */

#ifndef NORMLZR_H
#define NORMLZR_H

#include "unicode/utypes.h"

#if U_SHOW_CPLUSPLUS_API

/**
 * \file 
 * \brief C++ API: Unicode Normalization
 */
 
#if !UCONFIG_NO_NORMALIZATION

#include "unicode/chariter.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "unicode/uobject.h"

U_NAMESPACE_BEGIN
/**
 * Old Unicode normalization API.
 *
 * This API has been replaced by the Normalizer2 class and is only available
 * for backward compatibility. This class simply delegates to the Normalizer2 class.
 * There is one exception: The new API does not provide a replacement for Normalizer::compare().
 *
 * The Normalizer class supports the standard normalization forms described in
 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
 * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
 *
 * The Normalizer class consists of two parts:
 * - static functions that normalize strings or test if strings are normalized
 * - a Normalizer object is an iterator that takes any kind of text and
 *   provides iteration over its normalized form
 *
 * The Normalizer class is not suitable for subclassing.
 *
 * For basic information about normalization forms and details about the C API
 * please see the documentation in unorm.h.
 *
 * The iterator API with the Normalizer constructors and the non-static functions
 * use a CharacterIterator as input. It is possible to pass a string which
 * is then internally wrapped in a CharacterIterator.
 * The input text is not normalized all at once, but incrementally where needed
 * (providing efficient random access).
 * This allows to pass in a large text but spend only a small amount of time
 * normalizing a small part of that text.
 * However, if the entire text is normalized, then the iterator will be
 * slower than normalizing the entire text at once and iterating over the result.
 * A possible use of the Normalizer iterator is also to report an index into the
 * original text that is close to where the normalized characters come from.
 *
 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
 * The earlier implementation reported the getIndex() inconsistently,
 * and previous() could not be used after setIndex(), next(), first(), and current().
 *
 * Normalizer allows to start normalizing from anywhere in the input text by
 * calling setIndexOnly(), first(), or last().
 * Without calling any of these, the iterator will start at the beginning of the text.
 *
 * At any time, next() returns the next normalized code point (UChar32),
 * with post-increment semantics (like CharacterIterator::next32PostInc()).
 * previous() returns the previous normalized code point (UChar32),
 * with pre-decrement semantics (like CharacterIterator::previous32()).
 *
 * current() returns the current code point
 * (respectively the one at the newly set index) without moving
 * the getIndex(). Note that if the text at the current position
 * needs to be normalized, then these functions will do that.
 * (This is why current() is not const.)
 * It is more efficient to call setIndexOnly() instead, which does not
 * normalize.
 *
 * getIndex() always refers to the position in the input text where the normalized
 * code points are returned from. It does not always change with each returned
 * code point.
 * The code point that is returned from any of the functions
 * corresponds to text at or after getIndex(), according to the
 * function's iteration semantics (post-increment or pre-decrement).
 *
 * next() returns a code point from at or after the getIndex()
 * from before the next() call. After the next() call, the getIndex()
 * might have moved to where the next code point will be returned from
 * (from a next() or current() call).
 * This is semantically equivalent to array access with array[index++]
 * (post-increment semantics).
 *
 * previous() returns a code point from at or after the getIndex()
 * from after the previous() call.
 * This is semantically equivalent to array access with array[--index]
 * (pre-decrement semantics).
 *
 * Internally, the Normalizer iterator normalizes a small piece of text
 * starting at the getIndex() and ending at a following "safe" index.
 * The normalized results is stored in an internal string buffer, and
 * the code points are iterated from there.
 * With multiple iteration calls, this is repeated until the next piece
 * of text needs to be normalized, and the getIndex() needs to be moved.
 *
 * The following "safe" index, the internal buffer, and the secondary
 * iteration index into that buffer are not exposed on the API.
 * This also means that it is currently not practical to return to
 * a particular, arbitrary position in the text because one would need to
 * know, and be able to set, in addition to the getIndex(), at least also the
 * current index into the internal buffer.
 * It is currently only possible to observe when getIndex() changes
 * (with careful consideration of the iteration semantics),
 * at which time the internal index will be 0.
 * For example, if getIndex() is different after next() than before it,
 * then the internal index is 0 and one can return to this getIndex()
 * later with setIndexOnly().
 *
 * Note: While the setIndex() and getIndex() refer to indices in the
 * underlying Unicode input text, the next() and previous() methods
 * iterate through characters in the normalized output.
 * This means that there is not necessarily a one-to-one correspondence
 * between characters returned by next() and previous() and the indices
 * passed to and returned from setIndex() and getIndex().
 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
 *
 * @author Laura Werner, Mark Davis, Markus Scherer
 * @stable ICU 2.0
 */
class U_COMMON_API Normalizer : public UObject { … };

//-------------------------------------------------------------------------
// Inline implementations
//-------------------------------------------------------------------------

#ifndef U_HIDE_DEPRECATED_API
inline bool
Normalizer::operator!= (const Normalizer& other) const
{ … }

inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode,
                       UErrorCode &status) { … }

inline UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode,
                         UErrorCode &status) { … }
#endif  /* U_HIDE_DEPRECATED_API */

inline int32_t
Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
                    uint32_t options,
                    UErrorCode &errorCode) { … }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif // NORMLZR_H

#endif /* U_SHOW_CPLUSPLUS_API */
godot/thirdparty/icu4c/common/unicode/normlzr.h