// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uobject.h" #include "unicode/parseerr.h" #include "unicode/parsepos.h" #include "unicode/putil.h" #include "unicode/uchar.h" #include "unicode/ustring.h" #include "unicode/uniset.h" #include "unicode/utf16.h" #include "cstring.h" #include "funcrepl.h" #include "hash.h" #include "quant.h" #include "rbt.h" #include "rbt_data.h" #include "rbt_pars.h" #include "rbt_rule.h" #include "strmatch.h" #include "strrepl.h" #include "unicode/symtable.h" #include "tridpars.h" #include "uvector.h" #include "hash.h" #include "patternprops.h" #include "util.h" #include "cmemory.h" #include "uprops.h" #include "putilimp.h" // Operators #define VARIABLE_DEF_OP … #define FORWARD_RULE_OP … #define REVERSE_RULE_OP … #define FWDREV_RULE_OP … // Other special characters #define QUOTE … #define ESCAPE … #define END_OF_RULE … #define RULE_COMMENT_CHAR … #define SEGMENT_OPEN … #define SEGMENT_CLOSE … #define CONTEXT_ANTE … #define CONTEXT_POST … #define CURSOR_POS … #define CURSOR_OFFSET … #define ANCHOR_START … #define KLEENE_STAR … #define ONE_OR_MORE … #define ZERO_OR_ONE … #define DOT … static const char16_t DOT_SET[] = …; // A function is denoted &Source-Target/Variant(text) #define FUNCTION … // Aliases for some of the syntax characters. These are provided so // transliteration rules can be expressed in XML without clashing with // XML syntax characters '<', '>', and '&'. 
#define ALT_REVERSE_RULE_OP … #define ALT_FORWARD_RULE_OP … #define ALT_FWDREV_RULE_OP … #define ALT_FUNCTION … // Special characters disallowed at the top level static const char16_t ILLEGAL_TOP[] = …; // ")" // Special characters disallowed within a segment static const char16_t ILLEGAL_SEG[] = …; // "{}|@" // Special characters disallowed within a function argument static const char16_t ILLEGAL_FUNC[] = …; // "^(.*+?{}|@" // By definition, the ANCHOR_END special character is a // trailing SymbolTable.SYMBOL_REF character. // private static final char ANCHOR_END = '$'; static const char16_t gOPERATORS[] = …; static const char16_t HALF_ENDERS[] = …; // These are also used in Transliterator::toRules() static const int32_t ID_TOKEN_LEN = …; static const char16_t ID_TOKEN[] = …; // ':', ':' /* commented out until we do real ::BEGIN/::END functionality static const int32_t BEGIN_TOKEN_LEN = 5; static const char16_t BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN' static const int32_t END_TOKEN_LEN = 3; static const char16_t END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END' */ U_NAMESPACE_BEGIN //---------------------------------------------------------------------- // BEGIN ParseData //---------------------------------------------------------------------- /** * This class implements the SymbolTable interface. It is used * during parsing to give UnicodeSet access to variables that * have been defined so far. Note that it uses variablesVector, * _not_ data.setVariables. */ class ParseData : public UMemory, public SymbolTable { … }; ParseData::ParseData(const TransliterationRuleData* d, const UVector* sets, const Hashtable* vNames) : … { … } ParseData::~ParseData() { … } /** * Implement SymbolTable API. */ const UnicodeString* ParseData::lookup(const UnicodeString& name) const { … } /** * Implement SymbolTable API. */ const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { … } /** * Implement SymbolTable API. Parse out a symbol reference * name. 
*/ UnicodeString ParseData::parseReference(const UnicodeString& text, ParsePosition& pos, int32_t limit) const { … } UBool ParseData::isMatcher(UChar32 ch) { … } /** * Return true if the given character is a replacer standin or a plain * character (non standin). */ UBool ParseData::isReplacer(UChar32 ch) { … } //---------------------------------------------------------------------- // BEGIN RuleHalf //---------------------------------------------------------------------- /** * A class representing one side of a rule. This class knows how to * parse half of a rule. It is tightly coupled to the method * TransliteratorParser::parseRule(). */ class RuleHalf : public UMemory { … }; RuleHalf::RuleHalf(TransliteratorParser& p) : … { … } RuleHalf::~RuleHalf() { … } /** * Parse one side of a rule, stopping at either the limit, * the END_OF_RULE character, or an operator. * @return the index after the terminating character, or * if limit was reached, limit */ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … } /** * Parse a section of one side of a rule, stopping at either * the limit, the END_OF_RULE character, an operator, or a * segment close character. This method parses both a * top-level rule half and a segment within such a rule half. * It calls itself recursively to parse segments and nested * segments. * @param buf buffer into which to accumulate the rule pattern * characters, either literal characters from the rule or * standins for UnicodeMatcher objects including segments. * @param illegal the set of special characters that is illegal during * this parse. * @param isSegment if true, then we've already seen a '(' and * pos on entry points right after it. Accumulate everything * up to the closing ')', put it in a segment matcher object, * generate a standin for it, and add the standin to buf. As * a side effect, update the segments vector with a reference * to the segment matcher. 
This works recursively for nested * segments. If isSegment is false, just accumulate * characters into buf. * @return the index after the terminating character, or * if limit was reached, limit */ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, UnicodeString& buf, const UnicodeString& illegal, UBool isSegment, UErrorCode& status) { … } /** * Remove context. */ void RuleHalf::removeContext() { … } /** * Return true if this half looks like valid output, that is, does not * contain quantifiers or other special input-only elements. */ UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { … } /** * Return true if this half looks like valid input, that is, does not * contain functions or other special output-only elements. */ UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { … } //---------------------------------------------------------------------- // PUBLIC API //---------------------------------------------------------------------- /** * Constructor. */ TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : … { … } /** * Destructor. */ TransliteratorParser::~TransliteratorParser() { … } void TransliteratorParser::parse(const UnicodeString& rules, UTransDirection transDirection, UParseError& pe, UErrorCode& ec) { … } /** * Return the compound filter parsed by parse(). Caller owns result. */ UnicodeSet* TransliteratorParser::orphanCompoundFilter() { … } //---------------------------------------------------------------------- // Private implementation //---------------------------------------------------------------------- /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once, during construction. 
* @param status set to a failure code if there is a syntax error in the * rules */ void TransliteratorParser::parseRules(const UnicodeString& rule, UTransDirection theDirection, UErrorCode& status) { … } /** * Set the variable range to [start, end] (inclusive). */ void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { … } /** * Assert that the given character is NOT within the variable range. * If it is, return false. This is necessary to ensure that the * variable range does not overlap characters used in a rule. */ UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { … } /** * Set the maximum backup to 'backup', in response to a pragma * statement. */ void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { … } /** * Begin normalizing all rules using the given mode, in response * to a pragma statement. */ void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { … } static const char16_t PRAGMA_USE[] = …; // "use " static const char16_t PRAGMA_VARIABLE_RANGE[] = …; // "~variable range # #~;" static const char16_t PRAGMA_MAXIMUM_BACKUP[] = …; // "~maximum backup #~;" static const char16_t PRAGMA_NFD_RULES[] = …; // "~nfd rules~;" static const char16_t PRAGMA_NFC_RULES[] = …; // "~nfc rules~;" /** * Return true if the given rule looks like a pragma. * @param pos offset to the first non-whitespace character * of the rule. * @param limit offset just past the last character of the rule. */ UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { … } /** * Parse a pragma. This method assumes resemblesPragma() has * already returned true. * @param pos offset to the first non-whitespace character * of the rule. * @param limit offset just past the last character of the rule. * @return the position index after the final ';' of the pragma, * or -1 on failure. 
*/ int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … } /** * MAIN PARSER. Parse the next rule in the given rule string, starting * at pos. Return the index after the last character parsed. Do not * parse characters at or after limit. * * Important: The character at pos must be a non-whitespace character * that is not the comment character. * * This method handles quoting, escaping, and whitespace removal. It * parses the end-of-rule character. It recognizes context and cursor * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. */ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … } /** * Called by main parser upon syntax error. Search the rule string * for the probable end of the rule. Of course, if the error is that * the end of rule marker is missing, then the rule end will not be found. * In any case the rule start will be correctly reported. * @param parseErrorCode error code describing the syntax error * @param rule pattern string * @param pos position of first character of current rule * @param status ICU error code (in/out) */ int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, const UnicodeString& rule, int32_t pos, UErrorCode& status) { … } /** * Parse a UnicodeSet out, store it, and return the stand-in character * used to represent it. */ char16_t TransliteratorParser::parseSet(const UnicodeString& rule, ParsePosition& pos, UErrorCode& status) { … } /** * Generate and return a stand-in for a new UnicodeFunctor. Store * the matcher (adopt it). */ char16_t TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { … } /** * Return the standin for segment seg (1-based). */ char16_t TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { … } /** * Set the object for segment seg (1-based). 
*/ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { … } /** * Return the stand-in for the dot set. It is allocated the first * time and reused thereafter. */ char16_t TransliteratorParser::getDotStandIn(UErrorCode& status) { … } /** * Append the value of the given variable name to the given * UnicodeString. */ void TransliteratorParser::appendVariableDef(const UnicodeString& name, UnicodeString& buf, UErrorCode& status) { … } /** * Glue method to get around access restrictions in C++. */ /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { return Transliterator::createBasicInstance(id, canonID); }*/ U_NAMESPACE_END U_CAPI int32_t utrans_stripRules(const char16_t *source, int32_t sourceLen, char16_t *target, UErrorCode *status) { … } #endif /* #if !UCONFIG_NO_TRANSLITERATION */