// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2007-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: unisetspan.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2007mar01 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #include "unicode/uniset.h" #include "unicode/ustring.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "cmemory.h" #include "uvector.h" #include "unisetspan.h" U_NAMESPACE_BEGIN /* * List of offsets from the current position from where to try matching * a code point or a string. * Store offsets rather than indexes to simplify the code and use the same list * for both increments (in span()) and decrements (in spanBack()). * * Assumption: The maximum offset is limited, and the offsets that are stored * at any one time are relatively dense, that is, there are normally no gaps of * hundreds or thousands of offset values. * * The implementation uses a circular buffer of byte flags, * each indicating whether the corresponding offset is in the list. * This avoids inserting into a sorted list of offsets (or absolute indexes) and * physically moving part of the list. * * Note: In principle, the caller should setMaxLength() to the maximum of the * max string length and U16_LENGTH/U8_LENGTH to account for * "long" single code points. * However, this implementation uses at least a staticList with more than * U8_LENGTH entries anyway. * * Note: If maxLength were guaranteed to be no more than 32 or 64, * the list could be stored as bit flags in a single integer. * Rather than handling a circular buffer with a start list index, * the integer would simply be shifted when lower offsets are removed. * UnicodeSet does not have a limit on the lengths of strings. */ class OffsetList { … }; // Get the number of UTF-8 bytes for a UTF-16 (sub)string. static int32_t getUTF8Length(const char16_t *s, int32_t length) { … } // Append the UTF-8 version of the string to t and return the appended UTF-8 length. static int32_t appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) { … } static inline uint8_t makeSpanLengthByte(int32_t spanLength) { … } // Construct for all variants of span(), or only for any one variant. // Initialize as little as possible, for single use. UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which) : … { … } // Copy constructor. Assumes which==ALL for a frozen set. UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings) : … { … } UnicodeSetStringSpan::~UnicodeSetStringSpan() { … } void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { … } // Compare strings without any argument checks. Requires length>0. static inline UBool matches16(const char16_t *s, const char16_t *t, int32_t length) { … } static inline UBool matches8(const uint8_t *s, const uint8_t *t, int32_t length) { … } // Compare 16-bit Unicode strings (which may be malformed UTF-16) // at code point boundaries. // That is, each edge of a match must not be in the middle of a surrogate pair. static inline UBool matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) { … } // Does the set contain the next code point? // If so, return its length; otherwise return its negative length. static inline int32_t spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) { … } static inline int32_t spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) { … } static inline int32_t spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { … } static inline int32_t spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { … } /* * Note: In span() when spanLength==0 (after a string match, or at the beginning * after an empty code point span) and in spanNot() and spanNotUTF8(), * string matching could use a binary search * because all string matches are done from the same start index. * * For UTF-8, this would require a comparison function that returns UTF-16 order. * * This optimization should not be necessary for normal UnicodeSets because * most sets have no strings, and most sets with strings have * very few very short strings. * For cases with many strings, it might be better to use a different API * and implementation with a DFA (state machine). */ /* * Algorithm for span(USET_SPAN_CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Either recursively span for each code point or string match, * or recursively span for all but the shortest one and * iteratively continue the span with the shortest local match. * + Remember the longest recursive span (the farthest end point). * + If there is no match at the current position, neither for the code point there * nor for any set string, then stop and return the longest recursive span length. * * Optimized implementation: * * (We assume that most sets will have very few very short strings. * A span using a string-less set is extremely fast.) * * Create and cache a spanSet which contains all of the single code points * of the original set but none of its strings. * * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. * ~ Set strings that start with set-contained code points must be matched * with a partial overlap because the recursive algorithm would have tried * to match them at every position. * ~ Set strings that entirely consist of set-contained code points * are irrelevant for span(USET_SPAN_CONTAINED) because the * recursive algorithm would continue after them anyway * and find the longest recursive match from their end. * ~ Rather than recursing, note each end point of a set string match. * + If no set string matched after spanSet.span(), then return * with where the spanSet.span() ended. * + If at least one set string matched after spanSet.span(), then * pop the shortest string match end point and continue * the loop, trying to match all set strings from there. * + If at least one more set string matched after a previous string match, * then test if the code point after the previous string match is also * contained in the set. * Continue the loop with the shortest end point of either this code point * or a matching set string. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). * Stop if spanLength==0, otherwise continue the loop. * * By noting each end point of a set string match, * the function visits each string position at most once and finishes * in linear time. * * The recursive algorithm may visit the same string position many times * if multiple paths lead to it and finishes in exponential time. */ /* * Algorithm for span(USET_SPAN_SIMPLE) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Continue from the farthest match position and ignore all others. * + If there is no match at the current position, * then stop and return the current position. * * Optimized implementation: * * (Same assumption and spanSet as above.) * * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. * ~ Set strings that start with set-contained code points must be matched * with a partial overlap because the standard algorithm would have tried * to match them earlier. * ~ Set strings that entirely consist of set-contained code points * must be matched with a full overlap because the longest-match algorithm * would hide set string matches that end earlier. * Such set strings need not be matched earlier inside the code point span * because the standard algorithm would then have continued after * the set string match anyway. * ~ Remember the longest set string match (farthest end point) from the earliest * starting point. * + If no set string matched after spanSet.span(), then return * with where the spanSet.span() ended. * + If at least one set string matched, then continue the loop after the * longest match from the earliest position. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). * Stop if spanLength==0, otherwise continue the loop. */ int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { … } int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { … } int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { … } int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { … } /* * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then return with the current position. * + If a set string matches at the current position, then return with the current position. * * Optimized implementation: * * (Same assumption as for span() above.) * * Create and cache a spanNotSet which contains all of the single code points * of the original set but none of its strings. * For each set string add its initial code point to the spanNotSet. * (Also add its final code point for spanNotBack().) * * - Loop: * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). * + If the current code point is in the original set, then * return the current position. * + If any set string matches at the current position, then * return the current position. * + If there is no match at the current position, neither for the code point there * nor for any set string, then skip this code point and continue the loop. * This happens for set-string-initial code points that were added to spanNotSet * when there is not actually a match for such a set string. */ int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const { … } int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const { … } int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { … } int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { … } U_NAMESPACE_END