Grammar.h | Explore in Territory

//===--- Grammar.h - grammar used by clang pseudoparser  ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines base structures for parsing & modeling a grammar for a
//  programming language:
//
//    # This is a fake C++ BNF grammar
//    _ := translation-unit
//    translation-unit := declaration-seq_opt
//    declaration-seq := declaration
//    declaration-seq := declaration-seq declaration
//
//  A grammar formally describes a language, and it is constructed by a set of
//  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
//  nonterminal or terminal, identified by a SymbolID.
//
//  Annotations are supported in a syntax form of [key=value]. They specify
//  attributes which are associated with either a grammar symbol (on the
//  right-hand side of the symbol) or a grammar rule (at the end of the rule
//  body).
//  Attributes provide a way to inject custom code into the GLR parser. Each
//  unique attribute value creates an extension point (identified by ExtensionID
//  ), and an extension point corresponds to a piece of native code. For
//  example, C++ grammar has a rule:
//
//   compound_statement := { statement-seq [recover=Brackets] }
//
//  The `recover` attribute instructs the parser that we should perform error
//  recovery if parsing the statement-seq fails. The `Brackets` recovery
//  heuristic is implemented in CXX.cpp by binding the ExtensionID for the
//  `Recovery` value to a specific C++ function that finds the recovery point.
//
//  Notions about the BNF grammar:
//  - "_" is the start symbol of the augmented grammar;
//  - single-line comment is supported, starting with a #
//  - A rule describes how a nonterminal (left side of :=) is constructed, and
//    it is *per line* in the grammar file
//  - Terminals (also called tokens) correspond to the clang::TokenKind; they
//    are written in the grammar like "IDENTIFIER", "USING", "+"
//  - Nonterminals are specified with "lower-case" names in the grammar; they
//    shouldn't be nullable (has an empty sequence)
//  - optional symbols are supported (specified with a _opt suffix), and they
//    will be eliminated during the grammar parsing stage
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H

#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
#include <vector>

namespace clang {
namespace pseudo {
// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
// nonterminal IDs are indexes into a table of nonterminal symbols.
// Terminal IDs correspond to the clang TokenKind enum.
SymbolID;
// SymbolID is only 12 bits wide.
// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
static constexpr uint16_t SymbolBits = …;
static constexpr uint16_t NumTerminals = …;
// SymbolIDs with the top bit set are tokens/terminals.
static constexpr SymbolID TokenFlag = …;
inline bool isToken(SymbolID ID) { … }
inline bool isNonterminal(SymbolID ID) { … }
// The terminals are always the clang tok::TokenKind (not all are used).
inline tok::TokenKind symbolToToken(SymbolID SID) { … }
inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { … }

// An extension is a piece of native code specific to a grammar that modifies
// the behavior of annotated rules. One ExtensionID is assigned for each unique
// attribute value (all attributes share a namespace).
ExtensionID;

// A RuleID uniquely identifies a production rule in a grammar.
// It is an index into a table of rules.
RuleID;
// There are maximum 2^12 rules.
static constexpr unsigned RuleBits = …;

// Represent a production rule in the grammar, e.g.
//   expression := a b c
//   ^Target       ^Sequence
struct Rule { … };

struct GrammarTable;

// Grammar that describes a programming language, e.g. C++. It represents the
// contents of the specified grammar.
// It is a building block for constructing a table-based parser.
class Grammar { … };
// For each nonterminal X, computes the set of terminals that begin strings
// derived from X. (Known as FIRST sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
// For each nonterminal X, computes the set of terminals that could immediately
// follow X. (Known as FOLLOW sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);

// Storage for the underlying data of the Grammar.
// It can be constructed dynamically (from compiling BNF file) or statically
// (a compiled data-source).
struct GrammarTable { … };

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
llvm/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h