//===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines base structures for parsing & modeling a grammar for a // programming language: // // # This is a fake C++ BNF grammar // _ := translation-unit // translation-unit := declaration-seq_opt // declaration-seq := declaration // declaration-seq := declaration-seq declaration // // A grammar formally describes a language, and it is constructed by a set of // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either // nonterminal or terminal, identified by a SymbolID. // // Annotations are supported in a syntax form of [key=value]. They specify // attributes which are associated with either a grammar symbol (on the // right-hand side of the symbol) or a grammar rule (at the end of the rule // body). // Attributes provide a way to inject custom code into the GLR parser. Each // unique attribute value creates an extension point (identified by ExtensionID // ), and an extension point corresponds to a piece of native code. For // example, C++ grammar has a rule: // // compound_statement := { statement-seq [recover=Brackets] } // // The `recover` attribute instructs the parser that we should perform error // recovery if parsing the statement-seq fails. The `Brackets` recovery // heuristic is implemented in CXX.cpp by binding the ExtensionID for the // `Recovery` value to a specific C++ function that finds the recovery point. // // Notions about the BNF grammar: // - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # // - A rule describes how a nonterminal (left side of :=) is constructed, and // it is *per line* in the grammar file // - Terminals (also called tokens) correspond to the clang::TokenKind; they // are written in the grammar like "IDENTIFIER", "USING", "+" // - Nonterminals are specified with "lower-case" names in the grammar; they // shouldn't be nullable (has an empty sequence) // - optional symbols are supported (specified with a _opt suffix), and they // will be eliminated during the grammar parsing stage // //===----------------------------------------------------------------------===// #ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H #define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> #include <optional> #include <vector> namespace clang { namespace pseudo { // A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar. // nonterminal IDs are indexes into a table of nonterminal symbols. // Terminal IDs correspond to the clang TokenKind enum. SymbolID; // SymbolID is only 12 bits wide. // There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals. static constexpr uint16_t SymbolBits = …; static constexpr uint16_t NumTerminals = …; // SymbolIDs with the top bit set are tokens/terminals. static constexpr SymbolID TokenFlag = …; inline bool isToken(SymbolID ID) { … } inline bool isNonterminal(SymbolID ID) { … } // The terminals are always the clang tok::TokenKind (not all are used). inline tok::TokenKind symbolToToken(SymbolID SID) { … } inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { … } // An extension is a piece of native code specific to a grammar that modifies // the behavior of annotated rules. One ExtensionID is assigned for each unique // attribute value (all attributes share a namespace). ExtensionID; // A RuleID uniquely identifies a production rule in a grammar. // It is an index into a table of rules. RuleID; // There are maximum 2^12 rules. static constexpr unsigned RuleBits = …; // Represent a production rule in the grammar, e.g. // expression := a b c // ^Target ^Sequence struct Rule { … }; struct GrammarTable; // Grammar that describes a programming language, e.g. C++. It represents the // contents of the specified grammar. // It is a building block for constructing a table-based parser. class Grammar { … }; // For each nonterminal X, computes the set of terminals that begin strings // derived from X. (Known as FIRST sets in grammar-based parsers). std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &); // For each nonterminal X, computes the set of terminals that could immediately // follow X. (Known as FOLLOW sets in grammar-based parsers). std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &); // Storage for the underlying data of the Grammar. // It can be constructed dynamically (from compiling BNF file) or statically // (a compiled data-source). struct GrammarTable { … }; } // namespace pseudo } // namespace clang #endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H