rbt_pars.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1999-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "funcrepl.h"
#include "hash.h"
#include "quant.h"
#include "rbt.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rbt_rule.h"
#include "strmatch.h"
#include "strrepl.h"
#include "unicode/symtable.h"
#include "tridpars.h"
#include "uvector.h"
#include "hash.h"
#include "patternprops.h"
#include "util.h"
#include "cmemory.h"
#include "uprops.h"
#include "putilimp.h"

// Operators
#define VARIABLE_DEF_OP …
#define FORWARD_RULE_OP …
#define REVERSE_RULE_OP …
#define FWDREV_RULE_OP …

// Other special characters
#define QUOTE …
#define ESCAPE …
#define END_OF_RULE …
#define RULE_COMMENT_CHAR …

#define SEGMENT_OPEN …
#define SEGMENT_CLOSE …
#define CONTEXT_ANTE …
#define CONTEXT_POST …
#define CURSOR_POS …
#define CURSOR_OFFSET …
#define ANCHOR_START …
#define KLEENE_STAR …
#define ONE_OR_MORE …
#define ZERO_OR_ONE …

#define DOT …

static const char16_t DOT_SET[] = …;

// A function is denoted &Source-Target/Variant(text)
#define FUNCTION …

// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// XML syntax characters '<', '>', and '&'.
#define ALT_REVERSE_RULE_OP …
#define ALT_FORWARD_RULE_OP …
#define ALT_FWDREV_RULE_OP …
#define ALT_FUNCTION …

// Special characters disallowed at the top level
static const char16_t ILLEGAL_TOP[] = …; // ")"

// Special characters disallowed within a segment
static const char16_t ILLEGAL_SEG[] = …; // "{}|@"

// Special characters disallowed within a function argument
static const char16_t ILLEGAL_FUNC[] = …; // "^(.*+?{}|@"

// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END       = '$';

static const char16_t gOPERATORS[] = …;

static const char16_t HALF_ENDERS[] = …;

// These are also used in Transliterator::toRules()
static const int32_t ID_TOKEN_LEN = …;
static const char16_t   ID_TOKEN[]   = …; // ':', ':'

/*
commented out until we do real ::BEGIN/::END functionality
static const int32_t BEGIN_TOKEN_LEN = 5;
static const char16_t BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'

static const int32_t END_TOKEN_LEN = 3;
static const char16_t END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
*/

U_NAMESPACE_BEGIN

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface.  It is used
 * during parsing to give UnicodeSet access to variables that
 * have been defined so far.  Note that it uses variablesVector,
 * _not_ data.setVariables.
 */
class ParseData : public UMemory, public SymbolTable { … };

ParseData::ParseData(const TransliterationRuleData* d,
                     const UVector* sets,
                     const Hashtable* vNames) : … { … }

ParseData::~ParseData() { … }

/**
 * Implement SymbolTable API.
 */
const UnicodeString* ParseData::lookup(const UnicodeString& name) const { … }

/**
 * Implement SymbolTable API.
 */
const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { … }

/**
 * Implement SymbolTable API.  Parse out a symbol reference
 * name.
 */
UnicodeString ParseData::parseReference(const UnicodeString& text,
                                        ParsePosition& pos, int32_t limit) const { … }

UBool ParseData::isMatcher(UChar32 ch) { … }

/**
 * Return true if the given character is a replacer standin or a plain
 * character (non standin).
 */
UBool ParseData::isReplacer(UChar32 ch) { … }

//----------------------------------------------------------------------
// BEGIN RuleHalf
//----------------------------------------------------------------------

/**
 * A class representing one side of a rule.  This class knows how to
 * parse half of a rule.  It is tightly coupled to the method
 * RuleBasedTransliterator.Parser.parseRule().
 */
class RuleHalf : public UMemory { … };

RuleHalf::RuleHalf(TransliteratorParser& p) : … { … }

RuleHalf::~RuleHalf() { … }

/**
 * Parse one side of a rule, stopping at either the limit,
 * the END_OF_RULE character, or an operator.
 * @return the index after the terminating character, or
 * if limit was reached, limit
 */
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … }
 
/**
 * Parse a section of one side of a rule, stopping at either
 * the limit, the END_OF_RULE character, an operator, or a
 * segment close character.  This method parses both a
 * top-level rule half and a segment within such a rule half.
 * It calls itself recursively to parse segments and nested
 * segments.
 * @param buf buffer into which to accumulate the rule pattern
 * characters, either literal characters from the rule or
 * standins for UnicodeMatcher objects including segments.
 * @param illegal the set of special characters that is illegal during
 * this parse.
 * @param isSegment if true, then we've already seen a '(' and
 * pos on entry points right after it.  Accumulate everything
 * up to the closing ')', put it in a segment matcher object,
 * generate a standin for it, and add the standin to buf.  As
 * a side effect, update the segments vector with a reference
 * to the segment matcher.  This works recursively for nested
 * segments.  If isSegment is false, just accumulate
 * characters into buf.
 * @return the index after the terminating character, or
 * if limit was reached, limit
 */
int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
                               UnicodeString& buf,
                               const UnicodeString& illegal,
                               UBool isSegment, UErrorCode& status) { … }

/**
 * Remove context.
 */
void RuleHalf::removeContext() { … }

/**
 * Return true if this half looks like valid output, that is, does not
 * contain quantifiers or other special input-only elements.
 */
UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { … }

/**
 * Return true if this half looks like valid input, that is, does not
 * contain functions or other special output-only elements.
 */
UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { … }

//----------------------------------------------------------------------
// PUBLIC API
//----------------------------------------------------------------------

/**
 * Constructor.
 */
TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : … { … }

/**
 * Destructor.
 */
TransliteratorParser::~TransliteratorParser() { … }

void
TransliteratorParser::parse(const UnicodeString& rules,
                            UTransDirection transDirection,
                            UParseError& pe,
                            UErrorCode& ec) { … }

/**
 * Return the compound filter parsed by parse().  Caller owns result.
 */ 
UnicodeSet* TransliteratorParser::orphanCompoundFilter() { … }

//----------------------------------------------------------------------
// Private implementation
//----------------------------------------------------------------------

/**
 * Parse the given string as a sequence of rules, separated by newline
 * characters ('\n'), and cause this object to implement those rules.  Any
 * previous rules are discarded.  Typically this method is called exactly
 * once, during construction.
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
void TransliteratorParser::parseRules(const UnicodeString& rule,
                                      UTransDirection theDirection,
                                      UErrorCode& status)
{ … }

/**
 * Set the variable range to [start, end] (inclusive).
 */
void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { … }

/**
 * Assert that the given character is NOT within the variable range.
 * If it is, return false.  This is necessary to ensure that the
 * variable range does not overlap characters used in a rule.
 */
UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { … }

/**
 * Set the maximum backup to 'backup', in response to a pragma
 * statement.
 */
void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { … }

/**
 * Begin normalizing all rules using the given mode, in response
 * to a pragma statement.
 */
void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { … }

static const char16_t PRAGMA_USE[] = …; // "use "

static const char16_t PRAGMA_VARIABLE_RANGE[] = …; // "~variable range # #~;"

static const char16_t PRAGMA_MAXIMUM_BACKUP[] = …; // "~maximum backup #~;"

static const char16_t PRAGMA_NFD_RULES[] = …; // "~nfd rules~;"

static const char16_t PRAGMA_NFC_RULES[] = …; // "~nfc rules~;"

/**
 * Return true if the given rule looks like a pragma.
 * @param pos offset to the first non-whitespace character
 * of the rule.
 * @param limit pointer past the last character of the rule.
 */
UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { … }

/**
 * Parse a pragma.  This method assumes resemblesPragma() has
 * already returned true.
 * @param pos offset to the first non-whitespace character
 * of the rule.
 * @param limit pointer past the last character of the rule.
 * @return the position index after the final ';' of the pragma,
 * or -1 on failure.
 */
int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … }

/**
 * MAIN PARSER.  Parse the next rule in the given rule string, starting
 * at pos.  Return the index after the last character parsed.  Do not
 * parse characters at or after limit.
 *
 * Important:  The character at pos must be a non-whitespace character
 * that is not the comment character.
 *
 * This method handles quoting, escaping, and whitespace removal.  It
 * parses the end-of-rule character.  It recognizes context and cursor
 * indicators.  Once it does a lexical breakdown of the rule at pos, it
 * creates a rule object and adds it to our rule list.
 */
int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { … }

/**
 * Called by main parser upon syntax error.  Search the rule string
 * for the probable end of the rule.  Of course, if the error is that
 * the end of rule marker is missing, then the rule end will not be found.
 * In any case the rule start will be correctly reported.
 * @param msg error description
 * @param rule pattern string
 * @param start position of first character of current rule
 */
int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
                                          const UnicodeString& rule,
                                          int32_t pos,
                                          UErrorCode& status)
{ … }

/**
 * Parse a UnicodeSet out, store it, and return the stand-in character
 * used to represent it.
 */
char16_t TransliteratorParser::parseSet(const UnicodeString& rule,
                                          ParsePosition& pos,
                                          UErrorCode& status) { … }

/**
 * Generate and return a stand-in for a new UnicodeFunctor.  Store
 * the matcher (adopt it).
 */
char16_t TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { … }

/**
 * Return the standin for segment seg (1-based).
 */
char16_t TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { … }

/**
 * Set the object for segment seg (1-based).
 */
void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { … }

/**
 * Return the stand-in for the dot set.  It is allocated the first
 * time and reused thereafter.
 */
char16_t TransliteratorParser::getDotStandIn(UErrorCode& status) { … }

/**
 * Append the value of the given variable name to the given
 * UnicodeString.
 */
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
                                                  UnicodeString& buf,
                                                  UErrorCode& status) { … }

/**
 * Glue method to get around access restrictions in C++.
 */
/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}*/

U_NAMESPACE_END

U_CAPI int32_t
utrans_stripRules(const char16_t *source, int32_t sourceLen, char16_t *target, UErrorCode *status) { … }

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
chromium/third_party/icu/source/i18n/rbt_pars.cpp