// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: unormcmp.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2004sep13 * created by: Markus W. Scherer * * unorm_compare() function moved here from unorm.cpp for better modularization. * Depends on both normalization and case folding. * Allows unorm.cpp to not depend on any character properties code. */ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/unorm.h" #include "unicode/ustring.h" #include "cmemory.h" #include "normalizer2impl.h" #include "ucase.h" #include "uprops.h" #include "ustr_imp.h" U_NAMESPACE_USE /* compare canonically equivalent ------------------------------------------- */ /* * Compare two strings for canonical equivalence. * Further options include case-insensitive comparison and * code point order (as opposed to code unit order). * * In this function, canonical equivalence is optional as well. * If canonical equivalence is tested, then both strings must fulfill * the FCD check. * * Semantically, this is equivalent to * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) * where code point order, NFD and foldCase are all optional. * * String comparisons almost always yield results before processing both strings * completely. * They are generally more efficient working incrementally instead of * performing the sub-processing (strlen, normalization, case-folding) * on the entire strings first. * * It is also unnecessary to normalize identical characters. 
* * This function works in principle as follows: * * loop { * get one code unit c1 from s1 (-1 if end of source) * get one code unit c2 from s2 (-1 if end of source) * * if(either string finished) { * return result; * } * if(c1==c2) { * continue; * } * * // c1!=c2 * try to decompose/case-fold c1/c2, and continue if one does; * * // still c1!=c2 and neither decomposes/case-folds, return result * return c1-c2; * } * * When a character decomposes, then the pointer for that source changes to * the decomposition, pushing the previous pointer onto a stack. * When the end of the decomposition is reached, then the code unit reader * pops the previous source from the stack. * (Same for case-folding.) * * This is complicated further by operating on variable-width UTF-16. * The top part of the loop works on code units, while lookups for decomposition * and case-folding need code points. * Code points are assembled after the equality/end-of-source part. * The source pointer is only advanced beyond all code units when the code point * actually decomposes/case-folds. * * If we were on a trail surrogate unit when assembling a code point, * and the code point decomposes/case-folds, then the decomposition/folding * result must be compared with the part of the other string that corresponds to * this string's lead surrogate. * Since we only assemble a code point when hitting a trail unit when the * preceding lead units were identical, we back up the other string by one unit * in such a case. * * The optional code point order comparison at the end works with * the same fix-up as the other code point order comparison functions. * See ustring.c and the comment near the end of this function. * * Assumption: A decomposition or case-folding result string never contains * a single surrogate. This is a safe assumption in the Unicode Standard. * Therefore, we do not need to check for surrogate pairs across * decomposition/case-folding boundaries. 
* * Further assumptions (see verifications in tstnorm.cpp): * The API function checks for FCD first, while the core function * first case-folds and then decomposes. This requires that case-folding does not * un-FCD any strings. * * The API function may also NFD the input and turn off decomposition. * This requires that case-folding does not un-NFD strings either. * * TODO If any of the above two assumptions is violated, * then this entire code must be re-thought. * If this happens, then a simple solution is to case-fold both strings up front * and to turn off UNORM_INPUT_IS_FCD. * We already do this when the two strings are not both in FCD because makeFCD * would be a partial NFD before the case folding, which does not work. * Note that all of this is only a problem when case-folding _and_ * canonical equivalence come together. * (Comments in unorm_compare() are more up to date than this TODO.) */ /* stack element for previous-level source/decomposition pointers */ struct CmpEquivLevel { … }; CmpEquivLevel; /** * Internal option for unorm_cmpEquivFold() for decomposing. * If not set, just do strcasecmp(). */ #define _COMPARE_EQUIV … /* internal function */ static int32_t unorm_cmpEquivFold(const char16_t *s1, int32_t length1, const char16_t *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { … } static UBool _normalize(const Normalizer2 *n2, const char16_t *s, int32_t length, UnicodeString &normalized, UErrorCode *pErrorCode) { … } U_CAPI int32_t U_EXPORT2 unorm_compare(const char16_t *s1, int32_t length1, const char16_t *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { … } #endif /* #if !UCONFIG_NO_NORMALIZATION */