llvm/clang/include/clang/Tooling/Syntax/Tokens.h

//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing, parses consumes
//     this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, making it
// possible to find the subranges of the token stream covered by various AST
// nodes. Spelled tokens correspond directly to the source code written by the
// user.
//
// To allow composing these two use-cases, we also define operations that map
// expanded tokens to the spelled tokens that produced them (macro calls,
// directives, etc.).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <tuple>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file: the start offset is
/// included and the end offset is excluded from the range.
struct FileRange {};

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
class Token {};
/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate the difference between the two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end; the
/// spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use findSpelledByExpanded().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {};
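// Example (illustrative sketch only; the TokenBuffer members are elided above,
// and the optional-like return type of findSpelledByExpanded() is an
// assumption). Given a TokenBuffer `Buf` built by TokenCollector:
//   // All tokens the parser saw, including the trailing 'eof'.
//   llvm::ArrayRef<syntax::Token> Expanded = Buf.expandedTokens();
//   // Map a sub-range of expanded tokens (e.g. the tokens of an AST node)
//   // back to the spelled tokens written in the file.
//   if (auto Spelled = Buf.findSpelledByExpanded(Expanded.drop_back()))
//     for (const syntax::Token &T : *Spelled)
//       llvm::errs() << T << "\n";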

/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens.
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
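// Example (sketch): find the spelled tokens around a cursor position, e.g. to
// decide what an editor feature at `Loc` should operate on. `Loc` is assumed
// to be a spelling location inside a file covered by `Buf`.
//   for (const syntax::Token &T : syntax::spelledTokensTouching(Loc, Buf))
//     llvm::errs() << T << "\n"; // at most two tokens are printed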

/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);
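// Example (sketch, same assumptions as above): restrict the lookup to
// identifiers, e.g. when resolving the name under the cursor.
//   if (const syntax::Token *Id = syntax::spelledIdentifierTouching(Loc, Buf))
//     llvm::errs() << *Id << "\n";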

/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by the lexer in raw mode). This is a very low-level function; most users
/// should prefer to use TokenCollector. Lexing in raw mode produces wildly
/// different results from what one might expect when running a C++ frontend,
/// e.g. the preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to the one above, but tokenizes only a part of the file instead of
/// the whole file. Note that the first token might be incomplete if
/// FR.startOffset is not at the beginning of a token, and the last token
/// returned will start before FR.endOffset but might end after it.
std::vector<syntax::Token>
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
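// Example (sketch; Token::kind() is part of the elided Token interface and is
// an assumption here): raw-lex the main file and print the token kinds. No
// preprocessing happens, so directives and macro bodies appear verbatim.
//   std::vector<syntax::Token> Toks =
//       syntax::tokenize(SM.getMainFileID(), SM, LangOpts);
//   for (const syntax::Token &T : Toks)
//     llvm::errs() << tok::getTokenName(T.kind()) << "\n";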

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created in
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
class TokenCollector {};
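// Example (sketch; the TokenCollector members are elided above, so the
// constructor taking a Preprocessor and the consume() method returning a
// TokenBuffer are assumptions). Typical lifecycle inside a frontend action,
// with `CI` being the CompilerInstance:
//   // When the Preprocessor is available (around BeginSourceFile()):
//   auto Collector =
//       std::make_unique<syntax::TokenCollector>(CI.getPreprocessor());
//   // ... the action runs (Execute()) ...
//   // After it finishes, take ownership of the recorded tokens:
//   syntax::TokenBuffer Buf = std::move(*Collector).consume();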

} // namespace syntax
} // namespace clang

#endif