// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2015 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 07/02/2001 synwee Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION #include "unicode/usearch.h" #include "unicode/ustring.h" #include "unicode/uchar.h" #include "unicode/utf16.h" #include "normalizer2impl.h" #include "usrchimp.h" #include "cmemory.h" #include "ucln_in.h" #include "uassert.h" #include "ustr_imp.h" U_NAMESPACE_USE // internal definition --------------------------------------------------- #define LAST_BYTE_MASK_ … #define SECOND_LAST_BYTE_SHIFT_ … #define SUPPLEMENTARY_MIN_VALUE_ … static const Normalizer2Impl *g_nfcImpl = …; // internal methods ------------------------------------------------- /** * Fast collation element iterator setOffset. * This function does not check for bounds. * @param coleiter collation element iterator * @param offset to set */ static inline void setColEIterOffset(UCollationElements *elems, int32_t offset, UErrorCode &status) { … } /** * Getting the mask for collation strength * @param strength collation strength * @return collation element mask */ static inline uint32_t getMask(UCollationStrength strength) { … } U_CDECL_BEGIN static UBool U_CALLCONV usearch_cleanup() { … } U_CDECL_END /** * Initializing the fcd tables. * Internal method, status assumed to be a success. * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. */ static inline void initializeFCD(UErrorCode *status) { … } /** * Gets the fcd value for a character at the argument index. 
* This method takes supplementary characters into account. * @param str UTF16 string where character for fcd retrieval resides * @param offset position of the character whose fcd is to be retrieved, to be * overwritten with the next character position, taking * surrogate characters into consideration. * @param strlength length of the argument string * @return fcd value */ static uint16_t getFCD(const char16_t *str, int32_t *offset, int32_t strlength) { … } /** * Getting the modified collation elements taking into account the collation * attributes * @param strsrch string search data * @param sourcece * @return the modified collation element */ static inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) { … } /** * Allocates memory and returns nullptr if the allocation fails. * Internal method, status assumed to be a success. * @param size to allocate * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. * @return newly allocated array, nullptr otherwise */ static inline void * allocateMemory(uint32_t size, UErrorCode *status) { … } /** * Adds a uint32_t value to a destination array. * Creates a new array if we run out of space. The caller will have to * manually deallocate the newly allocated array. * Internal method, status assumed to be success, caller has to check status * before calling this method. destination not to be nullptr and has at least * size destinationlength. * @param destination target array * @param offset destination offset to add value * @param destinationlength target array size, return value for the new size * @param value to be added * @param increments incremental size expected * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. 
* @return new destination array, destination if there was no new allocation */ static inline int32_t * addTouint32_tArray(int32_t *destination, uint32_t offset, uint32_t *destinationlength, uint32_t value, uint32_t increments, UErrorCode *status) { … } /** * Adds a uint64_t value to a destination array. * Creates a new array if we run out of space. The caller will have to * manually deallocate the newly allocated array. * Internal method, status assumed to be success, caller has to check status * before calling this method. destination not to be nullptr and has at least * size destinationlength. * @param destination target array * @param offset destination offset to add value * @param destinationlength target array size, return value for the new size * @param value to be added * @param increments incremental size expected * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. * @return new destination array, destination if there was no new allocation */ static inline int64_t * addTouint64_tArray(int64_t *destination, uint32_t offset, uint32_t *destinationlength, uint64_t value, uint32_t increments, UErrorCode *status) { … } /** * Initializing the ce table for a pattern. * Stores non-ignorable collation keys. * Table size will be estimated by the size of the pattern text. Table * expansion will be performed as we go along. Adding 1 to ensure that the table * size definitely increases. * Internal method, status assumed to be a success. * @param strsrch string search data * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. */ static inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status) { … } /** * Initializing the pce table for a pattern. * Stores non-ignorable collation keys. * Table size will be estimated by the size of the pattern text. Table * expansion will be performed as we go along. 
Adding 1 to ensure that the table * size definitely increases. * Internal method, status assumed to be a success. * @param strsrch string search data * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. */ static inline void initializePatternPCETable(UStringSearch *strsrch, UErrorCode *status) { … } /** * Initializes the pattern struct. * @param strsrch UStringSearch data storage * @param status output error if any, caller to check status before calling * method, status assumed to be success when passed in. */ static inline void initializePattern(UStringSearch *strsrch, UErrorCode *status) { … } /** * Initializes the pattern struct and builds the pattern collation element table. * @param strsrch UStringSearch data storage * @param status for output errors if it occurs, status is assumed to be a * success when it is passed in. */ static inline void initialize(UStringSearch *strsrch, UErrorCode *status) { … } #if !UCONFIG_NO_BREAK_ITERATION // If the caller provided a character breakiterator we'll return that, // otherwise we lazily create the internal break iterator. static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status) { … } #endif /** * Sets the match result to "not found", regardless of the incoming error status. * If an error occurs while setting the result, it is reported back. * * @param strsrch string search data * @param status for output errors, if they occur. 
*/ static inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status) { … } /** * Checks if the offset runs out of the text string * @param offset * @param textlength of the text string * @return true if offset is out of bounds, false otherwise */ static inline UBool isOutOfBounds(int32_t textlength, int32_t offset) { … } /** * Checks for identical match * @param strsrch string search data * @param start offset of possible match * @param end offset of possible match * @return true if identical match is found */ static inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end) { … } // constructors and destructor ------------------------------------------- U_CAPI UStringSearch * U_EXPORT2 usearch_open(const char16_t *pattern, int32_t patternlength, const char16_t *text, int32_t textlength, const char *locale, UBreakIterator *breakiter, UErrorCode *status) { … } U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( const char16_t *pattern, int32_t patternlength, const char16_t *text, int32_t textlength, const UCollator *collator, UBreakIterator *breakiter, UErrorCode *status) { … } U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) { … } namespace { UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { … } } // set and get methods -------------------------------------------------- U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, int32_t position, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) { … } U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, USearchAttribute attribute, USearchAttributeValue value, UErrorCode *status) { … } U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( const UStringSearch *strsrch, USearchAttribute attribute) { … } U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( const UStringSearch *strsrch) { … } U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 
char16_t *result, int32_t resultCapacity, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( const UStringSearch *strsrch) { … } #if !UCONFIG_NO_BREAK_ITERATION U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, UBreakIterator *breakiter, UErrorCode *status) { … } U_CAPI const UBreakIterator* U_EXPORT2 usearch_getBreakIterator(const UStringSearch *strsrch) { … } #endif U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, const char16_t *text, int32_t textlength, UErrorCode *status) { … } U_CAPI const char16_t * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, int32_t *length) { … } U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, const UCollator *collator, UErrorCode *status) { … } U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) { … } U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, const char16_t *pattern, int32_t patternlength, UErrorCode *status) { … } U_CAPI const char16_t* U_EXPORT2 usearch_getPattern(const UStringSearch *strsrch, int32_t *length) { … } // miscellaneous methods -------------------------------------------------- U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, int32_t position, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, int32_t position, UErrorCode *status) { … } /** * If a direction switch is required, we'll count the number of ces till the * beginning of the collation element iterator and iterate forwards that * number of times. This is so that we get to the correct point within the * string to continue the search in. Imagine when we are in the middle of the * normalization buffer when the change in direction is requested. arrrgghh.... 
* After searching, the offset within the collation element iterator will be * shifted to the start of the match. If a match is not found, the offset would * have been set to the end of the text string in the collation element * iterator. * Okay, here's my take on normalization buffer. The only time when there can * be 2 matches within the same normalization is when the pattern consists * of all accents. But since the offset returned is from the text string, we * should not confuse the caller by returning the second match within the * same normalization buffer. If we do, the 2 results will have the same match * offsets, and that'll be confusing. I'll return the next match that doesn't * fall within the same normalization buffer. Note this does not affect the * results of matches spanning the text and the normalization buffer. * The position to start searching is taken from the collation element * iterator. Callers of this API would have to set the offset in the collation * element iterator before using this method. */ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, UErrorCode *status) { … } U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, UErrorCode *status) { … } U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) { … } // // CEI Collation Element + source text index. // These structs are kept in the circular buffer. // struct CEI { … }; U_NAMESPACE_BEGIN namespace { // // CEIBuffer A circular buffer of CEs-with-index from the text being searched. // #define DEFAULT_CEBUFFER_SIZE … #define CEBUFFER_EXTRA … // Some typical max values to make buffer size more reasonable for asymmetric search. // #8694 is for a better long-term solution to allocation of this buffer. 
#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L … #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER … #define MIGHT_BE_JAMO_L(c) … struct CEIBuffer { … }; CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { … } // TODO: add a reset or init function so that allocated // buffers can be retained & reused. CEIBuffer::~CEIBuffer() { … } // Get the CE with the specified index. // Index must be in the range // n-history_size < index < n+1 // where n is the largest index to have been fetched by some previous call to this function. // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. // const CEI *CEIBuffer::get(int32_t index) { … } // Get the CE with the specified index. // Index must be in the range // n-history_size < index < n+1 // where n is the largest index to have been fetched by some previous call to this function. // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. // const CEI *CEIBuffer::getPrevious(int32_t index) { … } } U_NAMESPACE_END // #define USEARCH_DEBUG #ifdef USEARCH_DEBUG #include <stdio.h> #include <stdlib.h> #endif /* * Find the next break boundary after startIndex. If the UStringSearch object * has an external break iterator, use that. Otherwise use the internal character * break iterator. */ static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) { … } /* * Returns true if index is on a break boundary. If the UStringSearch * has an external break iterator, test using that, otherwise test * using the internal character break iterator. 
*/ static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) { … } #if 0 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status) { if (U_FAILURE(status)) { return true; } #if !UCONFIG_NO_BREAK_ITERATION UBreakIterator *breakiterator = getBreakIterator(strsrch, status); if (U_SUCCESS(status)) { int32_t startindex = ubrk_first(breakiterator); int32_t endindex = ubrk_last(breakiterator); // out-of-range indexes are never boundary positions if (start < startindex || start > endindex || end < startindex || end > endindex) { return false; } return ubrk_isBoundary(breakiterator, start) && ubrk_isBoundary(breakiterator, end); } #endif return true; } #endif UCompareCEsResult; #define U_CE_LEVEL2_BASE … #define U_CE_LEVEL3_BASE … static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { … } namespace { UChar32 codePointAt(const USearch &search, int32_t index) { … } UChar32 codePointBefore(const USearch &search, int32_t index) { … } } // namespace U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, int32_t startIdx, int32_t *matchStart, int32_t *matchLimit, UErrorCode *status) { … } U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, int32_t startIdx, int32_t *matchStart, int32_t *matchLimit, UErrorCode *status) { … } // internal use methods declared in usrchimp.h ----------------------------- UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) { … } UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) { … } UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) { … } UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, UErrorCode *status) { … } #endif /* #if !UCONFIG_NO_COLLATION */