//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Tokens are the first level of abstraction above bytes used in pseudoparsing. // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). // The tokens are wrapped into pseudo::Token, along with line/indent info. // // Unlike clang, we make multiple passes over the whole file, out-of-order. // Therefore we retain the whole token sequence in memory. (This is feasible as // we process one file at a time). pseudo::TokenStream holds such a stream. // The initial stream holds the raw tokens read from the file; later passes // operate on derived TokenStreams (e.g. with directives stripped). // // Similar facilities from clang that are *not* used: // - SourceManager: designed around multiple files and precise macro expansion. // - clang::Token: coupled to SourceManager, doesn't retain layout info. // (pseudo::Token is similar, but without SourceLocations). // - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. // (pseudo::TokenStream is similar, but a flat token list). // // NOTE(review): the comments above say pseudo::Token / pseudo::TokenStream, // but the declarations below live in namespace clang::clangd - confirm naming. // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H #include "clang/Basic/LLVM.h" #include "clang/Basic/LangStandard.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> #include <limits> #include <memory> #include <vector> namespace clang { class LangOptions; namespace clangd { /// A single C++ or preprocessor token. 
/// /// Unlike clang::Token and syntax::Token, these tokens are not connected to a /// SourceManager - we are not dealing with multiple files. struct Token { … }; static_assert …; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); /// A half-open range of tokens within a stream. struct Token::Range { … }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); /// A complete sequence of Tokens representing a source file. /// /// This may match a raw file from disk, or be derived from a previous stream. /// For example, stripping comments from a TokenStream results in a new stream. /// /// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes: /// int main ( ) ; /// eof kw_int ident l_paren r_paren semi eof /// front() back() /// 0 1 2 3 4 5 class TokenStream { … }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); /// Extracts a raw token stream from the source code. /// /// All tokens will reference the data of the provided string. /// "word-like" tokens such as identifiers and keywords will be raw_identifier. TokenStream lex(const std::string &, const clang::LangOptions &); enum class LexFlags : uint8_t { … }; /// Generic lang options suitable for lexing/parsing a language. clang::LangOptions genericLangOpts( clang::Language = clang::Language::CXX, clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); /// Decodes raw tokens written in the source code, returning a derived stream. /// /// - escaped newlines within tokens are removed /// - trigraphs are replaced with the characters they encode /// - UCNs within raw_identifiers are replaced by the characters they encode /// (UCNs within strings, comments etc are not translated) /// - raw_identifier tokens are assigned their correct keyword type /// - the >> token is split into separate > > tokens /// (we use a modified grammar where >> is a nonterminal, not a token) /// /// The StartsPPLine flag is preserved. 
/// /// Here cooking happens before preprocessing, whereas formally we /// should only cook raw_identifiers that survive preprocessing. /// However, ignoring the Token::Kind of tokens in directives achieves the same. /// (And having cooked token kinds in PP-disabled sections is useful for us). TokenStream cook(const TokenStream &, const clang::LangOptions &); /// Drops comment tokens. TokenStream stripComments(const TokenStream &); } // namespace clangd } // namespace clang #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H