// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // file: rbbiscan.cpp // // Copyright (C) 2002-2016, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the Rule Based Break Iterator Rule Builder functions for // scanning the rules and assembling a parse tree. This is the first phase // of compiling the rules. // // The overall compilation of the rules is managed by class RBBIRuleBuilder, which will // create and use an instance of this class as part of the process. // #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" #include "cmemory.h" #include "cstring.h" #include "rbbirpt.h" // Contains state table for the rbbi rules parser. // generated by a Perl script. #include "rbbirb.h" #include "rbbinode.h" #include "rbbiscan.h" #include "rbbitblb.h" #include "uassert.h" //------------------------------------------------------------------------------ // // Unicode Set init strings for each of the character classes needed for parsing a rule file. // (Initialized with hex values for portability to EBCDIC based machines. // Really ugly, but there's no good way to avoid it.) // // The sets are referred to by name in the rbbirpt.txt, which is the // source form of the state transition table for the RBBI rule parser. 
// //------------------------------------------------------------------------------ static const char16_t gRuleSet_rule_char_pattern[] = …; static const char16_t gRuleSet_name_char_pattern[] = …; static const char16_t gRuleSet_digit_char_pattern[] = …; static const char16_t gRuleSet_name_start_char_pattern[] = …; static const char16_t kAny[] = …; // "any" U_CDECL_BEGIN static void U_CALLCONV RBBISetTable_deleter(void *p) { … } U_CDECL_END U_NAMESPACE_BEGIN //------------------------------------------------------------------------------ // // Constructor. // //------------------------------------------------------------------------------ RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) { … } //------------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------------ RBBIRuleScanner::~RBBIRuleScanner() { … } //------------------------------------------------------------------------------ // // doParseActions Do some action during rule parsing. // Called by the parse state machine. // Actions build the parse tree and Unicode Sets, // and maintain the parse stack for nested expressions. // // TODO: unify EParseAction and RBBI_RuleParseAction enum types. // They represent exactly the same thing. They're separate // only to work around enum forward declaration restrictions // in some compilers, while at the same time avoiding multiple // definition problems. I'm sure that there's a better way. // //------------------------------------------------------------------------------ UBool RBBIRuleScanner::doParseActions(int32_t action) { … } //------------------------------------------------------------------------------ // // error Report a rule parse error. // Only report it if no previous error has been recorded. 
// //------------------------------------------------------------------------------ void RBBIRuleScanner::error(UErrorCode e) { … } //------------------------------------------------------------------------------ // // fixOpStack The parse stack holds partially assembled chunks of the parse tree. // An entry on the stack may be as small as a single setRef node, // or as large as the parse tree // for an entire expression (this will be the one item left on the stack // when the parsing of an RBBI rule completes). // // This function is called when a binary operator is encountered. // It looks back up the stack for operators that are not yet associated // with a right operand, and if the precedence of the stacked operator >= // the precedence of the current operator, binds the operand left, // to the previously encountered operator. // //------------------------------------------------------------------------------ void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { … } //------------------------------------------------------------------------------ // // findSetFor given a UnicodeString, // - find the corresponding Unicode Set (uset node) // (create one if necessary) // - Set fLeftChild of the caller's node (should be a setRef node) // to the uset node // Maintain a hash table of uset nodes, so the same one is always used // for the same string. // If a "to adopt" set is provided and we haven't seen this key before, // add the provided set to the hash table. // If the string is one (32 bit) char in length, the set contains // just one element which is the char in question. // If the string is "any", the set contains all chars. // (The function itself returns void; the set is reached through the node.) // //------------------------------------------------------------------------------ void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { … } // // Assorted Unicode character constants. // Numeric because there is no portable way to enter them as literals. // (Think EBCDIC). 
// static const char16_t chCR = …; // New lines, for terminating comments. static const char16_t chLF = …; static const char16_t chNEL = …; // NEL newline variant static const char16_t chLS = …; // Unicode Line Separator static const char16_t chApos = …; // single quote, for quoted chars. static const char16_t chPound = …; // '#', introduces a comment. static const char16_t chBackSlash = …; // '\' introduces a char escape static const char16_t chLParen = …; static const char16_t chRParen = …; //------------------------------------------------------------------------------ // // stripRules Return a rules string without extra spaces. // (Comments are removed separately, during rule parsing.) // //------------------------------------------------------------------------------ UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { … } //------------------------------------------------------------------------------ // // nextCharLL Low Level Next Char from rule input source. // Get a char from the input character iterator, // keep track of input position for error reporting. // //------------------------------------------------------------------------------ UChar32 RBBIRuleScanner::nextCharLL() { … } //------------------------------------------------------------------------------ // // nextChar for rules scanning. At this level, we handle stripping // out comments and processing backslash character escapes. // The rest of the rules grammar is handled at the next level up. // // NOTE(review): returns void; the scanned char is presumably // delivered through the reference parameter c - confirm. // //------------------------------------------------------------------------------ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { … } //------------------------------------------------------------------------------ // // parse Parse RBBI rules. The state machine for rules parsing is here. 
// The state tables are hand-written in the file rbbirpt.txt, // and converted to the form used here by a perl // script rbbicst.pl // //------------------------------------------------------------------------------ void RBBIRuleScanner::parse() { … } //------------------------------------------------------------------------------ // // printNodeStack Debugging aid, compiled only when RBBI_DEBUG is defined. // Prints the caller-supplied title, then dumps each node // stack entry from fNodeStackPtr down to index 1 // (slot 0 is not printed). // //------------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBIRuleScanner::printNodeStack(const char *title) { RBBIDebugPrintf("%s. Dumping node stack...\n", title); int slot = fNodeStackPtr; while (slot > 0) { RBBINode::printTree(fNodeStack[slot], true); --slot; } } #endif //------------------------------------------------------------------------------ // // pushNewNode create a new RBBINode of the specified type and push it // onto the stack of nodes. // //------------------------------------------------------------------------------ RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { … } //------------------------------------------------------------------------------ // // scanSet Construct a UnicodeSet from the text at the current scan // position. Advance the scan position to the first character // after the set. // // A new RBBI setref node referring to the set is pushed onto the node // stack. // // The scan position is normally under the control of the state machine // that controls rule parsing. UnicodeSets, however, are parsed by // the UnicodeSet constructor, not by the RBBI rule parser. // //------------------------------------------------------------------------------ void RBBIRuleScanner::scanSet() { … } int32_t RBBIRuleScanner::numRules() { … } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */