rbbiscan.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  file:  rbbiscan.cpp
//
//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the Rule Based Break Iterator Rule Builder functions for
//   scanning the rules and assembling a parse tree.  This is the first phase
//   of compiling the rules.
//
//  The overall of the rules is managed by class RBBIRuleBuilder, which will
//  create and use an instance of this class as part of the process.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "cmemory.h"
#include "cstring.h"

#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
                       //   generated by a Perl script.
#include "rbbirb.h"
#include "rbbinode.h"
#include "rbbiscan.h"
#include "rbbitblb.h"

#include "uassert.h"

//------------------------------------------------------------------------------
//
// Unicode Set init strings for each of the character classes needed for parsing a rule file.
//               (Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.)
//
//              The sets are referred to by name in the rbbirpt.txt, which is the
//              source form of the state transition table for the RBBI rule parser.
//
//------------------------------------------------------------------------------
static const char16_t gRuleSet_rule_char_pattern[]       = …;

static const char16_t gRuleSet_name_char_pattern[]       = …;

static const char16_t gRuleSet_digit_char_pattern[] = …;

static const char16_t gRuleSet_name_start_char_pattern[] = …;

static const char16_t kAny[] = …;  // "any"


U_CDECL_BEGIN
static void U_CALLCONV RBBISetTable_deleter(void *p) { … }
U_CDECL_END

U_NAMESPACE_BEGIN

//------------------------------------------------------------------------------
//
//  Constructor.
//
//------------------------------------------------------------------------------
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
{ … }



//------------------------------------------------------------------------------
//
//  Destructor
//
//------------------------------------------------------------------------------
RBBIRuleScanner::~RBBIRuleScanner() { … }

//------------------------------------------------------------------------------
//
//  doParseAction        Do some action during rule parsing.
//                       Called by the parse state machine.
//                       Actions build the parse tree and Unicode Sets,
//                       and maintain the parse stack for nested expressions.
//
//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
//                              They represent exactly the same thing.  They're separate
//                              only to work around enum forward declaration restrictions
//                              in some compilers, while at the same time avoiding multiple
//                              definitions problems.  I'm sure that there's a better way.
//
//------------------------------------------------------------------------------
UBool RBBIRuleScanner::doParseActions(int32_t action)
{ … }




//------------------------------------------------------------------------------
//
//  Error         Report a rule parse error.
//                Only report it if no previous error has been recorded.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::error(UErrorCode e) { … }




//------------------------------------------------------------------------------
//
//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
//               An entry on the stack may be as small as a single setRef node,
//               or as large as the parse tree
//               for an entire expression (this will be the one item left on the stack
//               when the parsing of an RBBI rule completes.
//
//               This function is called when a binary operator is encountered.
//               It looks back up the stack for operators that are not yet associated
//               with a right operand, and if the precedence of the stacked operator >=
//               the precedence of the current operator, binds the operand left,
//               to the previously encountered operator.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { … }




//------------------------------------------------------------------------------
//
//   findSetFor    given a UnicodeString,
//                  - find the corresponding Unicode Set  (uset node)
//                         (create one if necessary)
//                  - Set fLeftChild of the caller's node (should be a setRef node)
//                         to the uset node
//                 Maintain a hash table of uset nodes, so the same one is always used
//                    for the same string.
//                 If a "to adopt" set is provided and we haven't seen this key before,
//                    add the provided set to the hash table.
//                 If the string is one (32 bit) char in length, the set contains
//                    just one element which is the char in question.
//                 If the string is "any", return a set containing all chars.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { … }



//
//  Assorted Unicode character constants.
//     Numeric because there is no portable way to enter them as literals.
//     (Think EBCDIC).
//
static const char16_t   chCR        = …;      // New lines, for terminating comments.
static const char16_t   chLF        = …;
static const char16_t   chNEL       = …;      //    NEL newline variant
static const char16_t   chLS        = …;    //    Unicode Line Separator
static const char16_t   chApos      = …;      //  single quote, for quoted chars.
static const char16_t   chPound     = …;      // '#', introduces a comment.
static const char16_t   chBackSlash = …;      // '\'  introduces a char escape
static const char16_t   chLParen    = …;
static const char16_t   chRParen    = …;


//------------------------------------------------------------------------------
//
//  stripRules    Return a rules string without extra spaces.
//                (Comments are removed separately, during rule parsing.)
//
//------------------------------------------------------------------------------
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { … }


//------------------------------------------------------------------------------
//
//  nextCharLL    Low Level Next Char from rule input source.
//                Get a char from the input character iterator,
//                keep track of input position for error reporting.
//
//------------------------------------------------------------------------------
UChar32  RBBIRuleScanner::nextCharLL() { … }


//------------------------------------------------------------------------------
//
//   nextChar     for rules scanning.  At this level, we handle stripping
//                out comments and processing backslash character escapes.
//                The rest of the rules grammar is handled at the next level up.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { … }

//------------------------------------------------------------------------------
//
//  Parse RBBI rules.   The state machine for rules parsing is here.
//                      The state tables are hand-written in the file rbbirpt.txt,
//                      and converted to the form used here by a perl
//                      script rbbicst.pl
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::parse() { … }


//------------------------------------------------------------------------------
//
//  printNodeStack     for debugging...
//
//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBIRuleScanner::printNodeStack(const char *title) {
    int i;
    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
    for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], true);}
}
#endif




//------------------------------------------------------------------------------
//
//  pushNewNode   create a new RBBINode of the specified type and push it
//                onto the stack of nodes.
//
//------------------------------------------------------------------------------
RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) { … }



//------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             A new RBBI setref node referring to the set is pushed onto the node
//             stack.
//
//             The scan position is normally under the control of the state machine
//             that controls rule parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the RBBI rule parser.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() { … }

int32_t RBBIRuleScanner::numRules() { … }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
godot/thirdparty/icu4c/common/rbbiscan.cpp