rbt_set.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1999-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "rbt_set.h"
#include "rbt_rule.h"
#include "cmemory.h"
#include "putilimp.h"

U_CDECL_BEGIN
static void U_CALLCONV _deleteRule(void *rule) { … }
U_CDECL_END

//----------------------------------------------------------------------
// BEGIN Debugging support
//----------------------------------------------------------------------

// #define DEBUG_RBT

#ifdef DEBUG_RBT
#include <stdio.h>
#include "charstr.h"

/**
 * @param appendTo result is appended to this param.
 * @param input the string being transliterated
 * @param pos the index struct
 */
static UnicodeString& _formatInput(UnicodeString &appendTo,
                                   const UnicodeString& input,
                                   const UTransPosition& pos) {
    // Output a string of the form aaa{bbb|ccc|ddd}eee, where
    // the {} indicate the context start and limit, and the ||
    // indicate the start and limit.
    if (0 <= pos.contextStart &&
        pos.contextStart <= pos.start &&
        pos.start <= pos.limit &&
        pos.limit <= pos.contextLimit &&
        pos.contextLimit <= input.length()) {

        UnicodeString a, b, c, d, e;
        input.extractBetween(0, pos.contextStart, a);
        input.extractBetween(pos.contextStart, pos.start, b);
        input.extractBetween(pos.start, pos.limit, c);
        input.extractBetween(pos.limit, pos.contextLimit, d);
        input.extractBetween(pos.contextLimit, input.length(), e);
        appendTo.append(a).append((char16_t)123/*{*/).append(b).
            append((char16_t)124/*|*/).append(c).append((char16_t)124/*|*/).append(d).
            append((char16_t)125/*}*/).append(e);
    } else {
        appendTo.append("INVALID UTransPosition");
        //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
        //                pos.contextStart + ", s=" + pos.start + ", l=" +
        //                pos.limit + ", cl=" + pos.contextLimit + "} on " +
        //                input);
    }
    return appendTo;
}

// Append a hex string to the target
UnicodeString& _appendHex(uint32_t number,
                          int32_t digits,
                          UnicodeString& target) {
    static const char16_t digitString[] = {
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
        0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
    };
    while (digits--) {
        target += digitString[(number >> (digits*4)) & 0xF];
    }
    return target;
}

// Replace nonprintable characters with unicode escapes
UnicodeString& _escape(const UnicodeString &source,
                       UnicodeString &target) {
    for (int32_t i = 0; i < source.length(); ) {
        UChar32 ch = source.char32At(i);
        i += U16_LENGTH(ch);
        if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {
            if (ch <= 0xFFFF) {
                target += "\\u";
                _appendHex(ch, 4, target);
            } else {
                target += "\\U";
                _appendHex(ch, 8, target);
            }
        } else {
            target += ch;
        }
    }
    return target;
}

inline void _debugOut(const char* msg, TransliterationRule* rule,
                      const Replaceable& theText, UTransPosition& pos) {
    UnicodeString buf(msg, "");
    if (rule) {
        UnicodeString r;
        rule->toRule(r, true);
        buf.append((char16_t)32).append(r);
    }
    buf.append(UnicodeString(" => ", ""));
    UnicodeString* text = (UnicodeString*)&theText;
    _formatInput(buf, *text, pos);
    UnicodeString esc;
    _escape(buf, esc);
    CharString cbuf(esc);
    printf("%s\n", (const char*) cbuf);
}

#else
#define _debugOut(msg, rule, theText, pos) …
#endif

//----------------------------------------------------------------------
// END Debugging support
//----------------------------------------------------------------------

// Fill the precontext and postcontext with the patterns of the rules
// that are masking one another.
static void maskingError(const icu::TransliterationRule& rule1,
                         const icu::TransliterationRule& rule2,
                         UParseError& parseError) { … }

U_NAMESPACE_BEGIN

/**
 * Construct a new empty rule set.
 */
TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : … { … }

/**
 * Copy constructor.
 */
TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) : … { … }

/**
 * Destructor.
 */
TransliterationRuleSet::~TransliterationRuleSet() { … }

void TransliterationRuleSet::setData(const TransliterationRuleData* d) { … }

/**
 * Return the maximum context length.
 * @return the length of the longest preceding context.
 */
int32_t TransliterationRuleSet::getMaximumContextLength() const { … }

/**
 * Add a rule to this set.  Rules are added in order, and order is
 * significant.  The last call to this method must be followed by
 * a call to <code>freeze()</code> before the rule set is used.
 *
 * <p>If freeze() has already been called, calling addRule()
 * unfreezes the rules, and freeze() must be called again.
 *
 * @param adoptedRule the rule to add
 */
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
                                     UErrorCode& status) { … }

/**
 * Check this for masked rules and index it to optimize performance.
 * The sequence of operations is: (1) add rules to a set using
 * <code>addRule()</code>; (2) freeze the set using
 * <code>freeze()</code>; (3) use the rule set.  If
 * <code>addRule()</code> is called after calling this method, it
 * invalidates this object, and this method must be called again.
 * That is, <code>freeze()</code> may be called multiple times,
 * although for optimal performance it shouldn't be.
 */
void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) { … }

/**
 * Transliterate the given text with the given UTransPosition
 * indices.  Return true if the transliteration should continue
 * or false if it should halt (because of a U_PARTIAL_MATCH match).
 * Note that false is only ever returned if isIncremental is true.
 * @param text the text to be transliterated
 * @param pos the position indices, which will be updated
 * @param incremental if true, assume new text may be inserted
 * at index.limit, and return false if there is a partial match.
 * @return true unless a U_PARTIAL_MATCH has been obtained,
 * indicating that transliteration should stop until more text
 * arrives.
 */
UBool TransliterationRuleSet::transliterate(Replaceable& text,
                                            UTransPosition& pos,
                                            UBool incremental) { … }

/**
 * Create rule strings that represents this rule set.
 */
UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
                                               UBool escapeUnprintable) const { … }

/**
 * Return the set of all characters that may be modified
 * (getTarget=false) or emitted (getTarget=true) by this set.
 */
UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
                               UBool getTarget) const
{ … }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
chromium/third_party/icu/source/i18n/rbt_set.cpp