// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************** * * Copyright (C) 1997-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************** */ #ifndef CHARITER_H #define CHARITER_H #include "unicode/utypes.h" #if U_SHOW_CPLUSPLUS_API #include "unicode/uobject.h" #include "unicode/unistr.h" /** * \file * \brief C++ API: Character Iterator */ U_NAMESPACE_BEGIN /** * Abstract class that defines an API for forward-only iteration * on text objects. * This is a minimal interface for iteration without random access * or backwards iteration. It is especially useful for wrapping * streams with converters into an object for collation or * normalization. * * <p>Characters can be accessed in two ways: as code units or as * code points. * Unicode code points are 21-bit integers and are the scalar values * of Unicode characters. ICU uses the type UChar32 for them. * Unicode code units are the storage units of a given * Unicode/UCS Transformation Format (a character encoding scheme). * With UTF-16, all code points can be represented with either one * or two code units ("surrogates"). * String storage is typically based on code units, while properties * of characters are typically determined using code point values. * Some processes may be designed to work with sequences of code units, * or it may be known that all characters that are important to an * algorithm can be represented with single code units. 
* Other processes will need to use the code point access functions.</p> * * <p>ForwardCharacterIterator provides nextPostInc() to access * a code unit and advance an internal position into the text object, * similar to a <code>return text[position++]</code>.<br> * It provides next32PostInc() to access a code point and advance an internal * position.</p> * * <p>next32PostInc() assumes that the current position is that of * the beginning of a code point, i.e., of its first code unit. * After next32PostInc(), this will be true again. * In general, access to code units and code points in the same * iteration loop should not be mixed. In UTF-16, if the current position * is on a second code unit (Low Surrogate), then only that code unit * is returned even by next32PostInc().</p> * * <p>For iteration with either function, there are two ways to * check for the end of the iteration. When there are no more * characters in the text object: * <ul> * <li>The hasNext() function returns false.</li> * <li>nextPostInc() and next32PostInc() return DONE * when one attempts to read beyond the end of the text object.</li> * </ul> * * Example: * \code * void function1(ForwardCharacterIterator &it) { * UChar32 c; * while(it.hasNext()) { * c=it.next32PostInc(); * // use c * } * } * * void function2(ForwardCharacterIterator &it) { * char16_t c; * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) { * // use c * } * } * \endcode * </p> * * @stable ICU 2.0 */ class U_COMMON_API ForwardCharacterIterator : public UObject { … }; /** * Abstract class that defines an API for iteration * on text objects. * This is an interface for forward and backward iteration * and random access into a text object. 
* * <p>The API provides backward compatibility to the Java and older ICU * CharacterIterator classes but extends them significantly: * <ol> * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li> * <li>While the old API functions provided forward iteration with * "pre-increment" semantics, the new one also provides functions * with "post-increment" semantics. They are more efficient and should * be the preferred iterator functions for new implementations. * The backward iteration always had "pre-decrement" semantics, which * are efficient.</li> * <li>Just like ForwardCharacterIterator, it provides access to * both code units and code points. Code point access versions are available * for the old and the new iteration semantics.</li> * <li>There are new functions for setting and moving the current position * without returning a character, for efficiency.</li> * </ol> * * See ForwardCharacterIterator for examples for using the new forward iteration * functions. For backward iteration, there is also a hasPrevious() function * that can be used analogously to hasNext(). 
* The old functions work as before and are shown below.</p> * * <p>Examples for some of the new functions:</p> * * Forward iteration with hasNext(): * \code * void forward1(CharacterIterator &it) { * UChar32 c; * for(it.setToStart(); it.hasNext();) { * c=it.next32PostInc(); * // use c * } * } * \endcode * Forward iteration more similar to loops with the old forward iteration, * showing a way to convert simple for() loops: * \code * void forward2(CharacterIterator &it) { * char16_t c; * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) { * // use c * } * } * \endcode * Backward iteration with setToEnd() and hasPrevious(): * \code * void backward1(CharacterIterator &it) { * UChar32 c; * for(it.setToEnd(); it.hasPrevious();) { * c=it.previous32(); * // use c * } * } * \endcode * Backward iteration with a more traditional for() loop: * \code * void backward2(CharacterIterator &it) { * char16_t c; * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) { * // use c * } * } * \endcode * * Example for random access: * \code * void random(CharacterIterator &it) { * // set to the third code point from the beginning * it.move32(3, CharacterIterator::kStart); * // get a code point from here without moving the position * UChar32 c=it.current32(); * // get the position * int32_t pos=it.getIndex(); * // get the previous code unit * char16_t u=it.previous(); * // move back one more code unit * it.move(-1, CharacterIterator::kCurrent); * // set the position back to where it was * // and read the same code point c and move beyond it * it.setIndex(pos); * if(c!=it.next32PostInc()) { * exit(1); // CharacterIterator inconsistent * } * } * \endcode * * <p>Examples, especially for the old API:</p> * * Function processing characters, in this example simple output * <pre> * \code * void processChar( char16_t c ) * { * cout << " " << c; * } * \endcode * </pre> * Traverse the text from start to finish * <pre> * \code * void traverseForward(CharacterIterator& 
iter) * { * for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) { * processChar(c); * } * } * \endcode * </pre> * Traverse the text backwards, from end to start * <pre> * \code * void traverseBackward(CharacterIterator& iter) * { * for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) { * processChar(c); * } * } * \endcode * </pre> * Traverse both forward and backward from a given position in the text. * The Unicode::isLetter() and Unicode::isDigit() checks in this example represent some additional stopping criteria. * <pre> * \code * void traverseOut(CharacterIterator& iter, int32_t pos) * { * char16_t c; * for (c = iter.setIndex(pos); * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c)); * c = iter.next()) {} * int32_t end = iter.getIndex(); * for (c = iter.setIndex(pos); * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c)); * c = iter.previous()) {} * int32_t start = iter.getIndex() + 1; * * cout << "start: " << start << " end: " << end << endl; * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) { * processChar(c); * } * } * \endcode * </pre> * Creating a StringCharacterIterator and calling the test functions * <pre> * \code * void CharacterIterator_Example( void ) * { * cout << endl << "===== CharacterIterator_Example: =====" << endl; * UnicodeString text("Ein kleiner Satz."); * StringCharacterIterator iterator(text); * cout << "----- traverseForward: -----------" << endl; * traverseForward( iterator ); * cout << endl << endl << "----- traverseBackward: ----------" << endl; * traverseBackward( iterator ); * cout << endl << endl << "----- traverseOut: ---------------" << endl; * traverseOut( iterator, 7 ); * cout << endl << endl << "-----" << endl; * } * \endcode * </pre> * * @stable ICU 2.0 */ class U_COMMON_API CharacterIterator : public ForwardCharacterIterator { … }; inline bool ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const { 
… } inline int32_t CharacterIterator::setToStart() { … } inline int32_t CharacterIterator::setToEnd() { … } inline int32_t CharacterIterator::startIndex() const { … } inline int32_t CharacterIterator::endIndex() const { … } inline int32_t CharacterIterator::getIndex() const { … } inline int32_t CharacterIterator::getLength() const { … } U_NAMESPACE_END #endif /* U_SHOW_CPLUSPLUS_API */ #endif