// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2014 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 03/22/2000 helena Creation. ********************************************************************** */ #ifndef STSEARCH_H #define STSEARCH_H #include "unicode/utypes.h" #if U_SHOW_CPLUSPLUS_API /** * \file * \brief C++ API: Service for searching text based on RuleBasedCollator. */ #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION #include "unicode/tblcoll.h" #include "unicode/coleitr.h" #include "unicode/search.h" U_NAMESPACE_BEGIN /** * * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides * language-sensitive text searching based on the comparison rules defined * in a {@link RuleBasedCollator} object. * StringSearch ensures that language eccentricity can be * handled, e.g. for the German collator, characters ß and SS will be matched * if case is chosen to be ignored. * See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm"> * "ICU Collation Design Document"</a> for more information. * <p> * There are 2 match options for selection:<br> * Let S' be the sub-string of a text string S between the offsets start and * end [start, end]. * <br> * A pattern string P matches a text string S at the offsets [start, end] * if * <pre> * option 1. Some canonical equivalent of P matches some canonical equivalent * of S' * option 2. P matches S' and if P starts or ends with a combining mark, * there exists no non-ignorable combining mark before or after S' * in S respectively. * </pre> * Option 2. will be the default. 
* <p> * This search has APIs similar to that of other text iteration mechanisms * such as the break iterators in <tt>BreakIterator</tt>. Using these * APIs, it is easy to scan through text looking for all occurrences of * a given pattern. This search iterator allows changing of direction by * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. * Though a direction change can occur without calling <tt>reset</tt> first, * this operation comes with some speed penalty. * Match results in the forward direction will match the result matches in * the backwards direction in the reverse order. * <p> * <tt>SearchIterator</tt> provides APIs to specify the starting position * within the text string to be searched, e.g. <tt>setOffset</tt>, * <tt>preceding</tt> and <tt>following</tt>. Since the * starting position will be set as it is specified, please take note that * there are some danger points at which the search may render incorrect * results: * <ul> * <li> The midst of a substring that requires normalization. * <li> If the following match is to be found, the position should not be the * second character which requires to be swapped with the preceding * character. Vice versa, if the preceding match is to be found, * position to search from should not be the first character which * requires to be swapped with the next character. E.g. certain Thai and * Lao characters require swapping. * <li> If a following pattern match is to be found, any position within a * contracting sequence except the first will fail. Vice versa if a * preceding pattern match is to be found, an invalid starting point * would be any character within a contracting sequence except the last. * </ul> * <p> * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired. * Using a <tt>BreakIterator</tt> will only give you results that exactly match the * boundaries given by the break iterator. 
For instance the pattern "e" will * not be found in the string "\u00e9" if a character break iterator is used. * <p> * Options are provided to handle overlapping matches. * E.g. In English, overlapping matches produce the results 0 and 2 * for the pattern "abab" in the text "ababab", whereas mutually * exclusive matches only produce the result 0. * <p> * Though collator attributes will be taken into consideration while * performing matches, there are no APIs here for setting and getting the * attributes. These attributes can be set by getting the collator * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>. * Lastly, to update <tt>StringSearch</tt> to the new collator attributes, * <tt>reset</tt> has to be called. * <p> * Restriction: <br> * Currently there are no composite characters that consist of a * character with combining class > 0 before a character with combining * class == 0. However, if such a character exists in the future, * <tt>StringSearch</tt> does not guarantee the results for option 1. * <p> * Consult the <tt>SearchIterator</tt> documentation for information on * and examples of how to use instances of this class to implement text * searching. * <pre><code> * UnicodeString target("The quick brown fox jumps over the lazy dog."); * UnicodeString pattern("fox"); * * UErrorCode error = U_ZERO_ERROR; * StringSearch iter(pattern, target, Locale::getUS(), nullptr, error); * for (int pos = iter.first(error); * pos != USEARCH_DONE; * pos = iter.next(error)) * { * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); * } * </code></pre> * <p> * Note, <tt>StringSearch</tt> is not to be subclassed. * </p> * @see SearchIterator * @see RuleBasedCollator * @since ICU 2.0 */ class U_I18N_API StringSearch final : public SearchIterator { … }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ #endif /* U_SHOW_CPLUSPLUS_API */ #endif