chromium/third_party/icu/source/common/unicode/uiter.h

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2011 International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uiter.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan18
*   created by: Markus W. Scherer
*/

#ifndef __UITER_H__
#define __UITER_H__

/**
 * \file
 * \brief C API: Unicode Character Iteration
 *
 * @see UCharIterator
 */

#include "unicode/utypes.h"

#if U_SHOW_CPLUSPLUS_API
    U_NAMESPACE_BEGIN

    class CharacterIterator;
    class Replaceable;

    U_NAMESPACE_END
#endif

U_CDECL_BEGIN

struct UCharIterator;
UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */

/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorOrigin;

/** Constants for UCharIterator. @stable ICU 2.6 */
enum {};


/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 *
 * @stable ICU 2.6
 */
#define UITER_NO_STATE

/**
 * Function type declaration for UCharIterator.getIndex().
 *
 * Gets the current position, or the start or limit of the
 * iteration range.
 *
 * This function may perform slowly for UITER_CURRENT after setState() was called,
 * or for UITER_LENGTH, because an iterator implementation may have to count
 * UChars if the underlying storage is not UTF-16.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param origin get the 0, start, limit, length, or current index
 * @return the requested index, or U_SENTINEL in an error condition
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorGetIndex;

/**
 * Function type declaration for UCharIterator.move().
 *
 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
 *
 * Moves the current position relative to the start or limit of the
 * iteration range, or relative to the current position itself.
 * The movement is expressed in numbers of code units forward
 * or backward by specifying a positive or negative delta.
 * Out of bounds movement will be pinned to the start or limit.
 *
 * This function may perform slowly for moving relative to UITER_LENGTH
 * because an iterator implementation may have to count the rest of the
 * UChars if the native storage is not UTF-16.
 *
 * When moving relative to the limit or length, or
 * relative to the current position after setState() was called,
 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
 * determination of the actual UTF-16 index.
 * The actual index can be determined with getIndex(UITER_CURRENT)
 * which will count the UChars if necessary.
 * See UITER_UNKNOWN_INDEX for details.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param delta can be positive, zero, or negative
 * @param origin move relative to the 0, start, limit, length, or current index
 * @return the new index, or U_SENTINEL on an error condition,
 *         or UITER_UNKNOWN_INDEX when the index is not known.
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @see UITER_UNKNOWN_INDEX
 * @stable ICU 2.1
 */
UCharIteratorMove;

/**
 * Function type declaration for UCharIterator.hasNext().
 *
 * Check if current() and next() can still
 * return another code unit.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether current() and next() can still return another code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorHasNext;

/**
 * Function type declaration for UCharIterator.hasPrevious().
 *
 * Check if previous() can still return another code unit.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether previous() can still return another code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorHasPrevious;
 
/**
 * Function type declaration for UCharIterator.current().
 *
 * Return the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorCurrent;

/**
 * Function type declaration for UCharIterator.next().
 *
 * Return the code unit at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit (and post-increment the current index)
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorNext;

/**
 * Function type declaration for UCharIterator.previous().
 *
 * Decrement the index and return the code unit from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code unit (after pre-decrementing the current index)
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorPrevious;

/**
 * Function type declaration for UCharIterator.reservedFn().
 * Reserved for future use.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param something some integer argument
 * @return some integer
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
UCharIteratorReserved;

/**
 * Function type declaration for UCharIterator.getState().
 *
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * It is recommended that the state value be calculated to be as small as
 * is feasible. For strings with limited lengths, fewer than 32 bits may
 * be sufficient.
 *
 * This is used together with setState()/UCharIteratorSetState
 * to save and restore the iterator position more efficiently than with
 * getIndex()/move().
 *
 * The iterator state is defined as a uint32_t value because it is designed
 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
 * of the character iterator.
 *
 * With some UCharIterator implementations (e.g., UTF-8),
 * getting and setting the UTF-16 index with existing functions
 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
 * relatively slow because the iterator has to "walk" from a known index
 * to the requested one.
 * This takes more time the farther it needs to go.
 *
 * An opaque state value allows an iterator implementation to provide
 * an internal index (UTF-8: the source byte array index) for
 * fast, constant-time restoration.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @see UITER_NO_STATE
 * @stable ICU 2.6
 */
UCharIteratorGetState;

/**
 * Function type declaration for UCharIterator.setState().
 *
 * Restore the "state" of the iterator using a state word from a getState() call.
 * The iterator object need not be the same one as for which getState() was called,
 * but it must be of the same type (set up using the same uiter_setXYZ function)
 * and it must iterate over the same string
 * (binary identical regardless of memory address).
 * For more about the state word see UCharIteratorGetState.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @stable ICU 2.6
 */
UCharIteratorSetState;


/**
 * C API for code unit iteration.
 * This can be used as a C wrapper around
 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
 *
 * There are two roles for using UCharIterator:
 *
 * A "provider" sets the necessary function pointers and controls the "protected"
 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
 *
 * Implementations of such C APIs are "callers" of UCharIterator functions;
 * they only use the "public" function pointers and never access the "protected"
 * fields directly.
 *
 * The current() and next() functions only check the current index against the
 * limit, and previous() only checks the current index against the start,
 * to see if the iterator already reached the end of the iteration range.
 *
 * The assumption - in all iterators - is that the index is moved via the API,
 * which means it won't go out of bounds, or the index is modified by
 * user code that knows enough about the iterator implementation to set valid
 * index values.
 *
 * UCharIterator functions return code unit values 0..0xffff,
 * or U_SENTINEL if the iteration bounds are reached.
 *
 * @stable ICU 2.1
 */
struct UCharIterator {};

/**
 * Helper function for UCharIterator to get the code point
 * at the current index.
 *
 * Return the code point that includes the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 * If the current code unit is a lead or trail surrogate,
 * then the following or preceding surrogate is used to form
 * the code point value.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point
 *
 * @see UCharIterator
 * @see U16_GET
 * @see UnicodeString::char32At()
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter);

/**
 * Helper function for UCharIterator to get the next code point.
 *
 * Return the code point at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point (and post-increment the current index)
 *
 * @see UCharIterator
 * @see U16_NEXT
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter);

/**
 * Helper function for UCharIterator to get the previous code point.
 *
 * Decrement the index and return the code point from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code point (after pre-decrementing the current index)
 *
 * @see UCharIterator
 * @see U16_PREV
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter);

/**
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * This is a convenience function that calls iter->getState(iter)
 * if iter->getState is not NULL;
 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @see UITER_NO_STATE
 * @stable ICU 2.6
 */
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter);

/**
 * Restore the "state" of the iterator using a state word from a getState() call.
 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);

/**
 * Set up a UCharIterator to iterate over a string.
 *
 * Sets the UCharIterator function pointers for iteration over the string s
 * with iteration boundaries start=index=0 and length=limit=string length.
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length.
 * The length field will be ignored.
 *
 * The string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s String to iterate over
 * @param length Length of s, or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);

/**
 * Set up a UCharIterator to iterate over a UTF-16BE string
 * (byte vector with a big-endian pair of bytes per UChar).
 *
 * Everything works just like with a normal UChar iterator (uiter_setString),
 * except that UChars are assembled from byte pairs,
 * and that the length argument here indicates an even number of bytes.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-16BE string to iterate over
 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
 *               (NUL means pair of 0 bytes at even index from s)
 *
 * @see UCharIterator
 * @see uiter_setString
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);

/**
 * Set up a UCharIterator to iterate over a UTF-8 string.
 *
 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
 * with UTF-8 iteration boundaries 0 and length.
 * The implementation counts the UTF-16 index on the fly and
 * lazily evaluates the UTF-16 length of the text.
 *
 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
 * When the reservedField is not 0, then it contains a supplementary code point
 * and the UTF-16 index is between the two corresponding surrogates.
 * At that point, the UTF-8 index is behind that code point.
 *
 * The UTF-8 string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() returns a state value consisting of
 * - the current UTF-8 source byte index (bits 31..1)
 * - a flag (bit 0) that indicates whether the UChar position is in the middle
 *   of a surrogate pair
 *   (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
 *
 * getState() cannot also encode the UTF-16 index in the state value.
 * move(relative to limit or length), or
 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-8 string to iterate over
 * @param length Length of s in bytes, or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);

#if U_SHOW_CPLUSPLUS_API

/**
 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
 *
 * Sets the UCharIterator function pointers for iteration using the
 * CharacterIterator charIter.
 *
 * The CharacterIterator pointer charIter is set into UCharIterator.context
 * without copying or cloning the CharacterIterator object.
 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
 * The iteration index and boundaries are controlled by the CharacterIterator.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param charIter CharacterIterator to wrap
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);

/**
 * Set up a UCharIterator to iterate over a C++ Replaceable.
 *
 * Sets the UCharIterator function pointers for iteration over the
 * Replaceable rep with iteration boundaries start=index=0 and
 * length=limit=rep->length().
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length=rep->length().
 * The length field will be ignored.
 *
 * The Replaceable pointer rep is set into UCharIterator.context without copying
 * or cloning/reallocating the Replaceable object.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param rep Replaceable to iterate over
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);

#endif

U_CDECL_END

#endif