GLR.h | Explore in Territory

//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a standard Generalized LR (GLR) parsing algorithm.
//
// The GLR parser behaves as a normal LR parser until it encounters a conflict.
// To handle a conflict (where there are multiple actions could perform), the
// parser will simulate nondeterminism by doing a breadth-first search
// over all the possibilities.
//
// Basic mechanisims of the GLR parser:
//  - A number of processes are operated in parallel.
//  - Each process has its own parsing stack and behaves as a standard
//    determinism LR parser.
//  - When a process encounters a conflict, it will be fork (one for each
//    avaiable action).
//  - When a process encounters an error, it is abandoned.
//  - All process are synchronized by the lookahead token: they perfrom shift
//    action at the same time, which means some processes need wait until other
//    processes have performed all reduce actions.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GLR_H
#define CLANG_PSEUDO_GLR_H

#include "clang-pseudo/Forest.h"
#include "clang-pseudo/Language.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang-pseudo/grammar/LRTable.h"
#include "llvm/Support/Allocator.h"
#include <vector>

namespace clang {
namespace pseudo {

// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
// parser.
//
// Each node stores a parse state, the last parsed ForestNode, and the parent
// node. There may be several heads (top of stack), and the parser operates by:
// - shift: pushing terminal symbols on top of the stack
// - reduce: replace N symbols on top of the stack with one nonterminal
//
// The structure is a DAG rather than a linear stack:
// - GLR allows multiple actions (conflicts) on the same head, producing forks
//   where several nodes have the same parent
// - The parser merges nodes with the same (state, ForestNode), producing joins
//   where one node has multiple parents
//
// The parser is responsible for creating nodes and keeping track of the set of
// heads. The GSS class is mostly an arena for them.
struct GSS { … };
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);

// Parameters for the GLR parsing.
struct ParseParams { … };

// Parses the given token stream as the start symbol with the GLR algorithm,
// and returns a forest node of the start symbol.
//
// A rule `_ := StartSymbol` must exit for the chosen start symbol.
//
// If the parsing fails, we model it as an opaque node in the forest.
ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
                     const Language &Lang);

// Shift a token onto all OldHeads, placing the results into NewHeads.
//
// Exposed for testing only.
void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
              const ForestNode &NextTok, const ParseParams &Params,
              const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
// Applies available reductions on Heads, appending resulting heads to the list.
//
// Exposed for testing only.
void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
               const ParseParams &Params, const Language &Lang);

// Heuristically recover from a state where no further parsing is possible.
//
// OldHeads is the parse state at TokenIndex.
// This function consumes zero or more tokens by advancing TokenIndex,
// and places any recovery states created in NewHeads.
//
// On failure, NewHeads is empty and TokenIndex is unchanged.
//
// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
// there is a risk of the parser falling into an infinite loop, creating an
// endless sequence of recovery nodes.
// Generally it is safe for recovery to match 0 tokens against sequence symbols
// like `statement-seq`, as the grammar won't permit another statement-seq
// immediately afterwards. However recovery strategies for `statement` should
// consume at least one token, as statements may be adjacent in the input.
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
                unsigned &TokenIndex, const ParseParams &Params,
                const Language &Lang, std::vector<const GSS::Node *> &NewHeads);

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GLR_H
llvm/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h