chromium/third_party/icu/source/i18n/rbt_pars.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1999-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "funcrepl.h"
#include "hash.h"
#include "quant.h"
#include "rbt.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rbt_rule.h"
#include "strmatch.h"
#include "strrepl.h"
#include "unicode/symtable.h"
#include "tridpars.h"
#include "uvector.h"
#include "hash.h"
#include "patternprops.h"
#include "util.h"
#include "cmemory.h"
#include "uprops.h"
#include "putilimp.h"

// Operators
#define VARIABLE_DEF_OP
#define FORWARD_RULE_OP
#define REVERSE_RULE_OP
#define FWDREV_RULE_OP

// Other special characters
#define QUOTE
#define ESCAPE
#define END_OF_RULE
#define RULE_COMMENT_CHAR

#define SEGMENT_OPEN
#define SEGMENT_CLOSE
#define CONTEXT_ANTE
#define CONTEXT_POST
#define CURSOR_POS
#define CURSOR_OFFSET
#define ANCHOR_START
#define KLEENE_STAR
#define ONE_OR_MORE
#define ZERO_OR_ONE

#define DOT

static const char16_t DOT_SET[] =;

// A function is denoted &Source-Target/Variant(text)
#define FUNCTION

// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// XML syntax characters '<', '>', and '&'.
#define ALT_REVERSE_RULE_OP
#define ALT_FORWARD_RULE_OP
#define ALT_FWDREV_RULE_OP
#define ALT_FUNCTION

// Special characters disallowed at the top level
static const char16_t ILLEGAL_TOP[] =; // ")"

// Special characters disallowed within a segment
static const char16_t ILLEGAL_SEG[] =; // "{}|@"

// Special characters disallowed within a function argument
static const char16_t ILLEGAL_FUNC[] =; // "^(.*+?{}|@"

// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END       = '$';

static const char16_t gOPERATORS[] =;

static const char16_t HALF_ENDERS[] =;

// These are also used in Transliterator::toRules()
static const int32_t ID_TOKEN_LEN =;
static const char16_t   ID_TOKEN[]   =; // ':', ':'

/*
commented out until we do real ::BEGIN/::END functionality
static const int32_t BEGIN_TOKEN_LEN = 5;
static const char16_t BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'

static const int32_t END_TOKEN_LEN = 3;
static const char16_t END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
*/

U_NAMESPACE_BEGIN

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface.  It is used
 * during parsing to give UnicodeSet access to variables that
 * have been defined so far.  Note that it uses variablesVector,
 * _not_ data.setVariables.
 */
class ParseData : public UMemory, public SymbolTable {};

ParseData::ParseData(const TransliterationRuleData* d,
                     const UVector* sets,
                     const Hashtable* vNames) :{}

ParseData::~ParseData() {}

/**
 * Implement SymbolTable API.
 */
const UnicodeString* ParseData::lookup(const UnicodeString& name) const {}

/**
 * Implement SymbolTable API.
 */
const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {}

/**
 * Implement SymbolTable API.  Parse out a symbol reference
 * name.
 */
UnicodeString ParseData::parseReference(const UnicodeString& text,
                                        ParsePosition& pos, int32_t limit) const {}

UBool ParseData::isMatcher(UChar32 ch) {}

/**
 * Return true if the given character is a replacer standin or a plain
 * character (non standin).
 */
UBool ParseData::isReplacer(UChar32 ch) {}

//----------------------------------------------------------------------
// BEGIN RuleHalf
//----------------------------------------------------------------------

/**
 * A class representing one side of a rule.  This class knows how to
 * parse half of a rule.  It is tightly coupled to the method
 * RuleBasedTransliterator.Parser.parseRule().
 */
class RuleHalf : public UMemory {};

RuleHalf::RuleHalf(TransliteratorParser& p) :{}

RuleHalf::~RuleHalf() {}

/**
 * Parse one side of a rule, stopping at either the limit,
 * the END_OF_RULE character, or an operator.
 * @return the index after the terminating character, or
 * if limit was reached, limit
 */
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {}
 
/**
 * Parse a section of one side of a rule, stopping at either
 * the limit, the END_OF_RULE character, an operator, or a
 * segment close character.  This method parses both a
 * top-level rule half and a segment within such a rule half.
 * It calls itself recursively to parse segments and nested
 * segments.
 * @param buf buffer into which to accumulate the rule pattern
 * characters, either literal characters from the rule or
 * standins for UnicodeMatcher objects including segments.
 * @param illegal the set of special characters that is illegal during
 * this parse.
 * @param isSegment if true, then we've already seen a '(' and
 * pos on entry points right after it.  Accumulate everything
 * up to the closing ')', put it in a segment matcher object,
 * generate a standin for it, and add the standin to buf.  As
 * a side effect, update the segments vector with a reference
 * to the segment matcher.  This works recursively for nested
 * segments.  If isSegment is false, just accumulate
 * characters into buf.
 * @return the index after the terminating character, or
 * if limit was reached, limit
 */
int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
                               UnicodeString& buf,
                               const UnicodeString& illegal,
                               UBool isSegment, UErrorCode& status) {}

/**
 * Remove context.
 */
void RuleHalf::removeContext() {}

/**
 * Return true if this half looks like valid output, that is, does not
 * contain quantifiers or other special input-only elements.
 */
UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {}

/**
 * Return true if this half looks like valid input, that is, does not
 * contain functions or other special output-only elements.
 */
UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {}

//----------------------------------------------------------------------
// PUBLIC API
//----------------------------------------------------------------------

/**
 * Constructor.
 */
TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :{}

/**
 * Destructor.
 */
TransliteratorParser::~TransliteratorParser() {}

void
TransliteratorParser::parse(const UnicodeString& rules,
                            UTransDirection transDirection,
                            UParseError& pe,
                            UErrorCode& ec) {}

/**
 * Return the compound filter parsed by parse().  Caller owns result.
 */ 
UnicodeSet* TransliteratorParser::orphanCompoundFilter() {}

//----------------------------------------------------------------------
// Private implementation
//----------------------------------------------------------------------

/**
 * Parse the given string as a sequence of rules, separated by newline
 * characters ('\n'), and cause this object to implement those rules.  Any
 * previous rules are discarded.  Typically this method is called exactly
 * once, during construction.
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
void TransliteratorParser::parseRules(const UnicodeString& rule,
                                      UTransDirection theDirection,
                                      UErrorCode& status)
{}

/**
 * Set the variable range to [start, end] (inclusive).
 */
void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) {}

/**
 * Assert that the given character is NOT within the variable range.
 * If it is, return false.  This is necessary to ensure that the
 * variable range does not overlap characters used in a rule.
 */
UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {}

/**
 * Set the maximum backup to 'backup', in response to a pragma
 * statement.
 */
void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) {}

/**
 * Begin normalizing all rules using the given mode, in response
 * to a pragma statement.
 */
void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) {}

static const char16_t PRAGMA_USE[] =; // "use "

static const char16_t PRAGMA_VARIABLE_RANGE[] =; // "~variable range # #~;"

static const char16_t PRAGMA_MAXIMUM_BACKUP[] =; // "~maximum backup #~;"

static const char16_t PRAGMA_NFD_RULES[] =; // "~nfd rules~;"

static const char16_t PRAGMA_NFC_RULES[] =; // "~nfc rules~;"

/**
 * Return true if the given rule looks like a pragma.
 * @param pos offset to the first non-whitespace character
 * of the rule.
 * @param limit pointer past the last character of the rule.
 */
UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {}

/**
 * Parse a pragma.  This method assumes resemblesPragma() has
 * already returned true.
 * @param pos offset to the first non-whitespace character
 * of the rule.
 * @param limit pointer past the last character of the rule.
 * @return the position index after the final ';' of the pragma,
 * or -1 on failure.
 */
int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {}

/**
 * MAIN PARSER.  Parse the next rule in the given rule string, starting
 * at pos.  Return the index after the last character parsed.  Do not
 * parse characters at or after limit.
 *
 * Important:  The character at pos must be a non-whitespace character
 * that is not the comment character.
 *
 * This method handles quoting, escaping, and whitespace removal.  It
 * parses the end-of-rule character.  It recognizes context and cursor
 * indicators.  Once it does a lexical breakdown of the rule at pos, it
 * creates a rule object and adds it to our rule list.
 */
int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {}

/**
 * Called by main parser upon syntax error.  Search the rule string
 * for the probable end of the rule.  Of course, if the error is that
 * the end of rule marker is missing, then the rule end will not be found.
 * In any case the rule start will be correctly reported.
 * @param msg error description
 * @param rule pattern string
 * @param start position of first character of current rule
 */
int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
                                          const UnicodeString& rule,
                                          int32_t pos,
                                          UErrorCode& status)
{}

/**
 * Parse a UnicodeSet out, store it, and return the stand-in character
 * used to represent it.
 */
char16_t TransliteratorParser::parseSet(const UnicodeString& rule,
                                          ParsePosition& pos,
                                          UErrorCode& status) {}

/**
 * Generate and return a stand-in for a new UnicodeFunctor.  Store
 * the matcher (adopt it).
 */
char16_t TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) {}

/**
 * Return the standin for segment seg (1-based).
 */
char16_t TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {}

/**
 * Set the object for segment seg (1-based).
 */
void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {}

/**
 * Return the stand-in for the dot set.  It is allocated the first
 * time and reused thereafter.
 */
char16_t TransliteratorParser::getDotStandIn(UErrorCode& status) {}

/**
 * Append the value of the given variable name to the given
 * UnicodeString.
 */
void TransliteratorParser::appendVariableDef(const UnicodeString& name,
                                                  UnicodeString& buf,
                                                  UErrorCode& status) {}

/**
 * Glue method to get around access restrictions in C++.
 */
/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}*/

U_NAMESPACE_END

U_CAPI int32_t
utrans_stripRules(const char16_t *source, int32_t sourceLen, char16_t *target, UErrorCode *status) {}

#endif /* #if !UCONFIG_NO_TRANSLITERATION */