usearch.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  07/02/2001   synwee      Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/usearch.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
#include "usrchimp.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "uassert.h"
#include "ustr_imp.h"

U_NAMESPACE_USE

// internal definition ---------------------------------------------------

#define LAST_BYTE_MASK_ …
#define SECOND_LAST_BYTE_SHIFT_ …
#define SUPPLEMENTARY_MIN_VALUE_ …

static const Normalizer2Impl *g_nfcImpl = …;

// internal methods -------------------------------------------------

/**
* Fast collation element iterator setOffset.
* This function does not check for bounds.
* @param elems collation element iterator
* @param offset to set
* @param status ICU error code, passed by reference
*/
static
inline void setColEIterOffset(UCollationElements *elems,
                              int32_t offset,
                              UErrorCode &status)
{ … }

/**
* Getting the mask for collation strength
* @param strength collation strength
* @return collation element mask
*/
static
inline uint32_t getMask(UCollationStrength strength)
{ … }

U_CDECL_BEGIN
static UBool U_CALLCONV
usearch_cleanup() { … }
U_CDECL_END

/**
* Initializing the fcd tables.
* Internal method, status assumed to be a success.
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
*/
static
inline void initializeFCD(UErrorCode *status)
{ … }

/**
* Gets the fcd value for a character at the argument index.
* This method takes supplementary characters into account.
* @param str UTF16 string where character for fcd retrieval resides
* @param offset position of the character whose fcd is to be retrieved, to be
*               overwritten with the next character position, taking
*               surrogate characters into consideration.
* @param strlength length of the argument string
* @return fcd value
*/
static
uint16_t getFCD(const char16_t   *str, int32_t *offset,
                             int32_t  strlength)
{ … }

/**
* Getting the modified collation elements taking into account the collation
* attributes
* @param strsrch string search data
* @param sourcece
* @return the modified collation element
*/
static
inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
{ … }

/**
* Allocates memory and returns nullptr if the allocation failed.
* Internal method, status assumed to be a success.
* @param size to allocate
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
* @return newly allocated array, nullptr otherwise
*/
static
inline void * allocateMemory(uint32_t size, UErrorCode *status)
{ … }

/**
* Adds a uint32_t value to a destination array.
* Creates a new array if we run out of space. The caller will have to
* manually deallocate the newly allocated array.
* Internal method, status assumed to be success, caller has to check status
* before calling this method. destination not to be nullptr and has at least
* size destinationlength.
* @param destination target array
* @param offset destination offset to add value
* @param destinationlength target array size, return value for the new size
* @param value to be added
* @param increments incremental size expected
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
* @return new destination array, destination if there was no new allocation
*/
static
inline int32_t * addTouint32_tArray(int32_t    *destination,
                                    uint32_t    offset,
                                    uint32_t   *destinationlength,
                                    uint32_t    value,
                                    uint32_t    increments,
                                    UErrorCode *status)
{ … }

/**
* Adds a uint64_t value to a destination array.
* Creates a new array if we run out of space. The caller will have to
* manually deallocate the newly allocated array.
* Internal method, status assumed to be success, caller has to check status
* before calling this method. destination not to be nullptr and has at least
* size destinationlength.
* @param destination target array
* @param offset destination offset to add value
* @param destinationlength target array size, return value for the new size
* @param value to be added
* @param increments incremental size expected
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
* @return new destination array, destination if there was no new allocation
*/
static
inline int64_t * addTouint64_tArray(int64_t    *destination,
                                    uint32_t    offset,
                                    uint32_t   *destinationlength,
                                    uint64_t    value,
                                    uint32_t    increments,
                                    UErrorCode *status)
{ … }

/**
* Initializing the ce table for a pattern.
* Stores non-ignorable collation keys.
* Table size will be estimated by the size of the pattern text. Table
* expansion will be performed as we go along. Adding 1 to ensure that the
* table size definitely increases.
* Internal method, status assumed to be a success.
* @param strsrch string search data
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
*/
static
inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status)
{ … }

/**
* Initializing the pce table for a pattern.
* Stores non-ignorable collation keys.
* Table size will be estimated by the size of the pattern text. Table
* expansion will be performed as we go along. Adding 1 to ensure that the
* table size definitely increases.
* Internal method, status assumed to be a success.
* @param strsrch string search data
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
*/
static
inline void initializePatternPCETable(UStringSearch *strsrch,
                                      UErrorCode    *status)
{ … }

/**
* Initializes the pattern struct.
* @param strsrch UStringSearch data storage
* @param status output error if any, caller to check status before calling
*               method, status assumed to be success when passed in.
*/
static
inline void initializePattern(UStringSearch *strsrch, UErrorCode *status)
{ … }

/**
* Initializes the pattern struct and builds the pattern collation element table.
* @param strsrch UStringSearch data storage
* @param status  for output errors if it occurs, status is assumed to be a
*                success when it is passed in.
*/
static
inline void initialize(UStringSearch *strsrch, UErrorCode *status)
{ … }

#if !UCONFIG_NO_BREAK_ITERATION
// If the caller provided a character breakiterator we'll return that,
// otherwise we lazily create the internal break iterator. 
static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status)
{ … }
#endif

/**
* Sets the match result to "not found", regardless of the incoming error status.
* If an error occurs while setting the result, it is reported back.
* 
* @param strsrch string search data
* @param status  for output errors, if they occur.
*/
static
inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status)
{ … }

/**
* Checks if the offset runs out of the text string
* @param offset
* @param textlength of the text string
* @return true if offset is out of bounds, false otherwise
*/
static
inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
{ … }

/**
* Checks for identical match
* @param strsrch string search data
* @param start offset of possible match
* @param end offset of possible match
* @return true if identical match is found
*/
static
inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end)
{ … }

// constructors and destructor -------------------------------------------

U_CAPI UStringSearch * U_EXPORT2 usearch_open(const char16_t *pattern,
                                          int32_t         patternlength,
                                    const char16_t       *text,
                                          int32_t         textlength,
                                    const char           *locale,
                                          UBreakIterator *breakiter,
                                          UErrorCode     *status)
{ … }

U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
                                  const char16_t       *pattern,
                                        int32_t         patternlength,
                                  const char16_t       *text,
                                        int32_t         textlength,
                                  const UCollator      *collator,
                                        UBreakIterator *breakiter,
                                        UErrorCode     *status)
{ … }

U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
{ … }

namespace {

UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { … }

}

// set and get methods --------------------------------------------------

U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
                                        int32_t        position,
                                        UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
{ … }

U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch        *strsrch,
                                           USearchAttribute      attribute,
                                           USearchAttributeValue value,
                                           UErrorCode           *status)
{ … }

U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
                                                const UStringSearch *strsrch,
                                                USearchAttribute attribute)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
                                                const UStringSearch *strsrch)
{ … }


U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
                                            char16_t      *result,
                                            int32_t        resultCapacity,
                                            UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
                                              const UStringSearch *strsrch)
{ … }

#if !UCONFIG_NO_BREAK_ITERATION

U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
                                               UBreakIterator *breakiter,
                                               UErrorCode     *status)
{ … }

U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch *strsrch)
{ … }

#endif

U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
                                      const char16_t      *text,
                                            int32_t        textlength,
                                            UErrorCode    *status)
{ … }

U_CAPI const char16_t * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
                                                     int32_t       *length)
{ … }

U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
                                          const UCollator     *collator,
                                                UErrorCode    *status)
{ … }

U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
{ … }

U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
                                         const char16_t      *pattern,
                                               int32_t        patternlength,
                                               UErrorCode    *status)
{ … }

U_CAPI const char16_t* U_EXPORT2
usearch_getPattern(const UStringSearch *strsrch,
                   int32_t             *length)
{ … }

// miscellaneous methods --------------------------------------------------

U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
                                       UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
                                           int32_t        position,
                                           UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
                                      UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
                                           int32_t        position,
                                           UErrorCode    *status)
{ … }

/**
* If a direction switch is required, we'll count the number of ces till the
* beginning of the collation element iterator and iterate forwards that
* number of times. This is so that we get to the correct point within the
* string to continue the search in. Imagine when we are in the middle of the
* normalization buffer when the change in direction is requested. arrrgghh....
* After searching the offset within the collation element iterator will be
* shifted to the start of the match. If a match is not found, the offset would
* have been set to the end of the text string in the collation element
* iterator.
* Okay, here's my take on normalization buffer. The only time when there can
* be 2 matches within the same normalization is when the pattern consists
* of all accents. But since the offset returned is from the text string, we
* should not confuse the caller by returning the second match within the
* same normalization buffer. If we do, the 2 results will have the same match
* offsets, and that'll be confusing. I'll return the next match that doesn't
* fall within the same normalization buffer. Note this does not affect the
* results of matches spanning the text and the normalization buffer.
* The position to start searching is taken from the collation element
* iterator. Callers of this API would have to set the offset in the collation
* element iterator before using this method.
*/
U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
                                      UErrorCode    *status)
{ … }

U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
                                          UErrorCode    *status)
{ … }



U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
{ … }

//
//  CEI  Collation Element + source text index.
//       These structs are kept in the circular buffer.
//
struct  CEI { … };

U_NAMESPACE_BEGIN

namespace {
//
//  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
//
#define DEFAULT_CEBUFFER_SIZE …
#define CEBUFFER_EXTRA …
// Some typical max values to make buffer size more reasonable for asymmetric search.
// #8694 is for a better long-term solution to allocation of this buffer.
#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L …
#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER …
#define MIGHT_BE_JAMO_L(c) …
struct CEIBuffer { … };


CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { … }

// TODO: add a reset or init function so that allocated
//       buffers can be retained & reused.

CEIBuffer::~CEIBuffer() { … }


// Get the CE with the specified index.
//   Index must be in the range
//          n-history_size < index < n+1
//   where n is the largest index to have been fetched by some previous call to this function.
//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
//
const CEI *CEIBuffer::get(int32_t index) { … }

// Get the CE with the specified index.
//   Index must be in the range
//          n-history_size < index < n+1
//   where n is the largest index to have been fetched by some previous call to this function.
//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
//
const CEI *CEIBuffer::getPrevious(int32_t index) { … }

}

U_NAMESPACE_END


// #define USEARCH_DEBUG

#ifdef USEARCH_DEBUG
#include <stdio.h>
#include <stdlib.h>
#endif

/*
 * Find the next break boundary after startIndex. If the UStringSearch object
 * has an external break iterator, use that. Otherwise use the internal character
 * break iterator.
 */
static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) { … }

/*
 * Returns true if index is on a break boundary. If the UStringSearch
 * has an external break iterator, test using that, otherwise test
 * using the internal character break iterator.
 */
static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) { … }

#if 0
/**
* Checks whether both ends of a candidate match lie on break boundaries.
* The boundaries are tested with the iterator returned by getBreakIterator().
* When break iteration is compiled out, every position is accepted.
* @param strsrch string search data
* @param start offset of the start of the candidate match
* @param end offset of the end of the candidate match
* @param status error code; if it is already a failure on entry, or if
*               obtaining the break iterator fails, true is returned without
*               testing any boundary.
* @return false if either offset lies outside the range spanned by the break
*         iterator or is not a boundary, true otherwise
*/
static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return true;
    }

#if !UCONFIG_NO_BREAK_ITERATION
    // getBreakIterator() is declared with a non-const UStringSearch *, so the
    // const pointer received here cannot be passed through without a cast.
    UBreakIterator *breakiterator =
        getBreakIterator(const_cast<UStringSearch *>(strsrch), status);
    if (U_SUCCESS(status)) {
        int32_t startindex = ubrk_first(breakiterator);
        int32_t endindex   = ubrk_last(breakiterator);

        // out-of-range indexes are never boundary positions
        if (start < startindex || start > endindex ||
            end < startindex || end > endindex) {
            return false;
        }

        // start is tested first; end is only tested when start is a boundary
        return ubrk_isBoundary(breakiterator, start) &&
               ubrk_isBoundary(breakiterator, end);
    }
#endif

    return true;
}
#endif

UCompareCEsResult;
#define U_CE_LEVEL2_BASE …
#define U_CE_LEVEL3_BASE …

static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { … }

namespace {

UChar32 codePointAt(const USearch &search, int32_t index) { … }

UChar32 codePointBefore(const USearch &search, int32_t index) { … }

}  // namespace

U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
                                       int32_t        startIdx,
                                       int32_t        *matchStart,
                                       int32_t        *matchLimit,
                                       UErrorCode     *status)
{ … }

U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
                                                int32_t        startIdx,
                                                int32_t        *matchStart,
                                                int32_t        *matchLimit,
                                                UErrorCode     *status)
{ … }

// internal use methods declared in usrchimp.h -----------------------------

UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
{ … }

UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
{ … }

UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
{ … }

UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
                                      UErrorCode    *status)
{ … }

#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
chromium/third_party/icu/source/i18n/usearch.cpp