//===- Tokens.h - collect tokens from preprocessing -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and the tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing; the parser
//     consumes this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, making it
// possible to find the subranges of the token stream covered by various AST
// nodes. Spelled tokens correspond directly to the source code written by the
// user.
//
// To allow composing these two use-cases, we also define operations that map
// between expanded tokens and the spelled tokens that produced them (macro
// calls, directives, etc.).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <tuple>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file; the start offset is
/// included and the end offset is excluded from the range.
struct FileRange { … };

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
class Token { … };

/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate the difference between the two:
///    #define FOO 10
///    int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end; the
/// spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens, use findSpelledByExpanded().
///
/// To build a token buffer, use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
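///
/// For illustration only, a sketch of mapping an AST node back to the spelled
/// tokens that produced it. The exact member signatures live in the elided
/// class body; in particular, the expandedTokens(SourceRange) overload, the
/// optional-like result of findSpelledByExpanded(), and the names `Buffer` and
/// `D` are assumptions of this sketch:
/// \code
///   // `Buffer` is a syntax::TokenBuffer, e.g. built by TokenCollector.
///   // Expanded tokens covering the source range of an AST node `D`.
///   llvm::ArrayRef<syntax::Token> Expanded =
///       Buffer.expandedTokens(D->getSourceRange());
///   // Map them back to the spelled tokens that produced them. This can fail,
///   // e.g. when the range covers only a part of a macro expansion.
///   if (auto Spelled = Buffer.findSpelledByExpanded(Expanded))
///     for (const syntax::Token &T : *Spelled)
///       llvm::errs() << T << "\n";
/// \endcode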
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer { … };

/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens.
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);

/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);

/// Lex the text buffer corresponding to \p FID in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by the lexer in raw mode). This is a very low-level function; most users
/// should prefer to use TokenCollector. Lexing in raw mode produces wildly
/// different results from what one might expect when running a C++ frontend,
/// e.g. the preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to the overload above, but tokenizes only a part of the file rather
/// than the whole file. Note that the first token might be incomplete if
/// FR.startOffset is not at the beginning of a token, and the last token
/// returned will start before FR.endOffset but might end after it.
std::vector<syntax::Token> tokenize(const FileRange &FR,
                                    const SourceManager &SM,
                                    const LangOptions &LO);

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created in
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes. (A short usage sketch appears at the
/// end of this header.)
class TokenCollector { … };

} // namespace syntax
} // namespace clang

#endif
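
// An illustrative usage sketch for TokenCollector and the tokenize() helper;
// it is not part of the interface above. The constructor and consume() member
// used below live in the elided class bodies, and `CI` (a CompilerInstance) is
// an assumed name:
//
//   // Collect the tokens of the main file while the frontend action runs:
//   // create the collector in BeginSourceFile() and consume the result after
//   // Execute() finishes.
//   syntax::TokenCollector Collector(CI.getPreprocessor());
//   // ... FrontendAction::Execute() ...
//   syntax::TokenBuffer Buffer = std::move(Collector).consume();
//
//   // Alternatively, lex a single file in raw mode without running the
//   // preprocessor at all; the result has no 'eof' token at the end.
//   const SourceManager &SM = CI.getSourceManager();
//   std::vector<syntax::Token> Raw =
//       syntax::tokenize(SM.getMainFileID(), SM, CI.getLangOpts());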