llvm/clang-tools-extra/pseudo/include/clang-pseudo/Token.h

//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// The tokens are wrapped into pseudo::Token, along with line/indent info.
//
// Unlike clang, we make multiple passes over the whole file, out-of-order.
// Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). pseudo::TokenStream holds such a stream.
// The initial stream holds the raw tokens read from the file, later passes
// operate on derived TokenStreams (e.g. with directives stripped).
//
// Similar facilities from clang that are *not* used:
//  - SourceManager: designed around multiple files and precise macro expansion.
//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
//                  (pseudo::Token is similar, but without SourceLocations).
//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
//                  (pseudo::TokenStream is similar, but a flat token list).
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_TOKEN_H
#define CLANG_PSEUDO_TOKEN_H

#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangStandard.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>

namespace clang {
class LangOptions;
namespace pseudo {

/// A single C++ or preprocessor token.
///
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
/// SourceManager - we are not dealing with multiple files.
///
/// Per the file header, a pseudo::Token wraps a token scanned by clang's raw
/// lexer, along with line/indent info.
///
/// NOTE(review): no members are visible in this view (the body is empty as
/// shown) - confirm the field list against the full definition.
struct Token {};
// NOTE(review): as shown, this static_assert has neither a condition nor a
// message, which is ill-formed C++. The asserted expression (presumably a
// compile-time check on Token, given its position) appears to be missing -
// restore it or confirm against the full file.
static_assert;
/// Stream-insertion overload for a single Token.
/// Only declared here; the output format is chosen by the definition.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &Tok);

/// A half-open range of tokens within a stream.
///
/// NOTE(review): this is an out-of-line definition of a type nested in Token,
/// so Token must declare `struct Range;` - that declaration is not visible in
/// the Token body shown in this view. No members are visible here either;
/// presumably the range is a begin/end pair of token indices - confirm.
struct Token::Range {};
/// Stream-insertion overload for a Token::Range.
/// Only declared here; the output format is chosen by the definition.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R);

/// A complete sequence of Tokens representing a source file.
///
/// This may match a raw file from disk, or be derived from a previous stream.
/// For example, stripping comments from a TokenStream results in a new stream.
///
/// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes:
///       int      main   (        )        ;
///   eof kw_int   ident  l_paren  r_paren  semi   eof
///       front()                           back()
///       0        1      2        3        4      5
///
/// The rows above are: source text, token kind, accessor, and token index.
/// Note that in the diagram the leading eof sentinel carries no index: index 0
/// and front() are the first real token, back() is the last real token, and
/// the trailing eof sits one past it.
///
/// NOTE(review): no members are visible in this view (the body is empty as
/// shown) - confirm the accessor set against the full definition.
class TokenStream {};
/// Stream-insertion overload for a whole TokenStream.
/// Only declared here; the output format is chosen by the definition.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                              const TokenStream &Stream);

/// Lexes source code into a raw token stream.
///
/// \param Code the source text to scan. Every token in the result references
///        the data of this string.
/// \param LangOpts the language options to lex with.
/// \returns the raw stream. "Word-like" tokens such as identifiers and
///          keywords are all reported as raw_identifier.
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts);
/// Flags stored in a single byte (the underlying type is uint8_t).
///
/// NOTE(review): no enumerators are visible in this view. Presumably these are
/// per-token flags recorded while lexing (the cook() documentation mentions a
/// StartsPPLine flag) - confirm against the full definition.
enum class LexFlags : uint8_t {};
/// Returns generic language options suitable for lexing/parsing a language.
///
/// \param Lang the language to configure for; defaults to C++.
/// \param Standard the language standard; defaults to lang_unspecified.
clang::LangOptions genericLangOpts(
    clang::Language Lang = clang::Language::CXX,
    clang::LangStandard::Kind Standard = clang::LangStandard::lang_unspecified);

/// Decodes raw tokens as written in the source code, returning a derived
/// stream in which:
///
/// - escaped newlines within tokens are removed
/// - trigraphs are replaced with the characters they encode
/// - UCNs within raw_identifiers are replaced by the characters they encode
///   (UCNs within strings, comments etc are not translated)
/// - raw_identifier tokens are assigned their correct keyword type
/// - the >> token is split into separate > > tokens
///   (we use a modified grammar where >> is a nonterminal, not a token)
///
/// The StartsPPLine flag is preserved.
///
/// A note on ordering: identifiers are cooked here before preprocessing,
/// whereas formally only raw_identifiers that survive preprocessing should be
/// cooked. However, ignoring the Token::Kind of tokens in directives achieves
/// the same result. (And having cooked token kinds in PP-disabled sections is
/// useful for us).
///
/// \param Tokens the stream to decode.
/// \param LangOpts the language options to use when cooking.
TokenStream cook(const TokenStream &Tokens, const clang::LangOptions &LangOpts);

/// Returns a stream derived from \p Tokens with the comment tokens dropped.
TokenStream stripComments(const TokenStream &Tokens);

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_TOKEN_H