// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // rbbisetb.cpp // /* *************************************************************************** * Copyright (C) 2002-2008 International Business Machines Corporation * * and others. All rights reserved. * *************************************************************************** */ // // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules // (part of the rule building process.) // // Starting with the rules parse tree from the scanner, // // - Enumerate the set of UnicodeSets that are referenced // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same // set of input unicode sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. All characters within one // of these sets belong to the same set of the original // UnicodeSets from the user's rules. // - construct the trie table that maps input characters // to the index of the matching non-overlapping set of set from // the previous step. 
// #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/uniset.h" #include "uvector.h" #include "uassert.h" #include "cmemory.h" #include "cstring.h" #include "rbbisetb.h" #include "rbbinode.h" U_NAMESPACE_BEGIN const int32_t kMaxCharCategoriesFor8BitsTrie = …; //------------------------------------------------------------------------ // // Constructor // //------------------------------------------------------------------------ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) { … } //------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------ RBBISetBuilder::~RBBISetBuilder() { … } //------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::buildRanges() { … } // // Build the Trie table for mapping UChar32 values to the corresponding // range group number. // void RBBISetBuilder::buildTrie() { … } void RBBISetBuilder::mergeCategories(IntPair categories) { … } //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int32_t RBBISetBuilder::getTrieSize() { … } //----------------------------------------------------------------------------------- // // serializeTrie() Put the serialized trie at the specified address. // Trust the caller to have given us enough memory. // getTrieSize() MUST be called first. 
// //----------------------------------------------------------------------------------- void RBBISetBuilder::serializeTrie(uint8_t *where) { … } //------------------------------------------------------------------------ // // addValToSets Add a runtime-mapped input value to each uset from a // list of uset nodes. (val corresponds to a state table column.) // For each of the original Unicode sets - which correspond // directly to uset nodes - a logically equivalent expression // is constructed in terms of the remapped runtime input // symbol set. This function adds one runtime input symbol to // a list of sets. // // The "logically equivalent expression" is the tree for an // or-ing together of all of the symbols that go into the set. // //------------------------------------------------------------------------ void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { … } void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { … } //------------------------------------------------------------------------ // // getNumCharCategories // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getNumCharCategories() const { … } //------------------------------------------------------------------------ // // getDictCategoriesStart // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getDictCategoriesStart() const { … } //------------------------------------------------------------------------ // // sawBOF // //------------------------------------------------------------------------ UBool RBBISetBuilder::sawBOF() const { … } //------------------------------------------------------------------------ // // getFirstChar Given a runtime RBBI character category, find // the first UChar32 that is in the set of chars // in the category. 
//------------------------------------------------------------------------ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { … } //------------------------------------------------------------------------ // // printRanges A debugging function. // dump out all of the range definitions. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRanges() { RangeDescriptor *rlRange; int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar); for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName {u"anon"}; RBBINode *setRef = usetNode->fParent; if (setRef != nullptr) { RBBINode *varRef = setRef->fParent; if (varRef != nullptr && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } RBBIDebugPrintf("\n"); } } #endif //------------------------------------------------------------------------ // // printRangeGroups A debugging function. // dump out all of the range groups. 
//
//------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug-only dump (built when RBBI_DEBUG is defined) of the range list, one
// entry per group.  For every range flagged fFirstInGroup this prints:
//   - the group number (fNum), plus a <DICT> tag when that number is at or
//     above fDictCategoriesStart;
//   - a name for each RBBINode in the range's fIncludesSets (the fText of the
//     node's grandparent when that is a varRef node, otherwise "anon");
//   - every range from this one to the end of the list that carries the same
//     fNum, wrapped so that at most five ranges appear per output line.
void RBBISetBuilder::printRangeGroups() {
    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");

    for (RangeDescriptor *groupHead = fRangeList; groupHead != nullptr; groupHead = groupHead->fNext) {
        // Only the first range of each group produces output.
        if (!groupHead->fFirstInGroup) {
            continue;
        }

        int groupNum = groupHead->fNum;
        RBBIDebugPrintf("%2i ", groupNum);
        if (groupNum >= fDictCategoriesStart) {
            RBBIDebugPrintf(" <DICT> ");
        }

        for (int setIdx = 0; setIdx < groupHead->fIncludesSets->size(); ++setIdx) {
            RBBINode *setNode = static_cast<RBBINode *>(groupHead->fIncludesSets->elementAt(setIdx));

            // Candidate name holder: two parent links up, null if either link is missing.
            RBBINode *nameNode = (setNode->fParent != nullptr) ? setNode->fParent->fParent : nullptr;

            UnicodeString setName = UNICODE_STRING("anon", 4);
            if (nameNode != nullptr && nameNode->fType == RBBINode::varRef) {
                setName = nameNode->fText;
            }

            RBBI_DEBUG_printUnicodeString(setName);
            RBBIDebugPrintf(" ");
        }

        // List the member ranges, starting a fresh line before every fifth one.
        int printedCount = 0;
        for (RangeDescriptor *member = groupHead; member != nullptr; member = member->fNext) {
            if (member->fNum != groupHead->fNum) {
                continue;
            }
            if (printedCount % 5 == 0) {
                RBBIDebugPrintf("\n ");
            }
            ++printedCount;
            RBBIDebugPrintf(" %05x-%05x", member->fStartChar, member->fEndChar);
        }
        RBBIDebugPrintf("\n");
    }

    RBBIDebugPrintf("\n");
}
#endif

//------------------------------------------------------------------------
//
//   printSets     A debugging function.
//                 dump out all of the set definitions.
// //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printSets() { int i; RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); for (i=0; ; i++) { RBBINode *usetNode; RBBINode *setRef; RBBINode *varRef; UnicodeString setName; usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); if (usetNode == nullptr) { break; } RBBIDebugPrintf("%3d ", i); setName = UNICODE_STRING("anonymous", 9); setRef = usetNode->fParent; if (setRef != nullptr) { varRef = setRef->fParent; if (varRef != nullptr && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); RBBI_DEBUG_printUnicodeString(usetNode->fText); RBBIDebugPrintf("\n"); if (usetNode->fLeftChild != nullptr) { RBBINode::printTree(usetNode->fLeftChild, true); } } RBBIDebugPrintf("\n"); } #endif //------------------------------------------------------------------------------------- // // RangeDescriptor copy constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) : … { … } //------------------------------------------------------------------------------------- // // RangeDesriptor default constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(UErrorCode &status) { … } //------------------------------------------------------------------------------------- // // RangeDesriptor Destructor // //------------------------------------------------------------------------------------- RangeDescriptor::~RangeDescriptor() { … } //------------------------------------------------------------------------------------- // // RangeDesriptor::split() // //------------------------------------------------------------------------------------- void RangeDescriptor::split(UChar32 where, UErrorCode 
&status) { … } //------------------------------------------------------------------------------------- // // RangeDescriptor::isDictionaryRange // // Test whether this range includes characters from // the original Unicode Set named "dictionary". // // This function looks through the Unicode Sets that // the range includes, checking for one named "dictionary" // // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // //------------------------------------------------------------------------------------- bool RangeDescriptor::isDictionaryRange() { … } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */