//===- Lexer.cpp - C Language Family Lexer --------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the Lexer and Token interfaces. // //===----------------------------------------------------------------------===// #include "clang/Lex/Lexer.h" #include "UnicodeCharSets.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/TokenKinds.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Lex/LiteralSupport.h" #include "clang/Lex/MultipleIncludeOpt.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Lex/Token.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/NativeFormatting.h" #include "llvm/Support/Unicode.h" #include "llvm/Support/UnicodeCharRanges.h" #include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <cstring> #include <optional> #include <string> #include <tuple> #include <utility> #ifdef __SSE4_2__ #include <nmmintrin.h> #endif using namespace clang; //===----------------------------------------------------------------------===// // Token Class Implementation //===----------------------------------------------------------------------===// /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { … } /// getObjCKeywordID - Return the ObjC keyword kind. tok::ObjCKeywordKind Token::getObjCKeywordID() const { … } /// Determine whether the token kind starts a simple-type-specifier. bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const { … } //===----------------------------------------------------------------------===// // Lexer Class Implementation //===----------------------------------------------------------------------===// void Lexer::anchor() { … } void Lexer::InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd) { … } /// Lexer constructor - Create a new lexer object for the specified buffer /// with the specified preprocessor managing the lexing process. This lexer /// assumes that the associated file buffer and Preprocessor objects will /// outlive it, so it doesn't take ownership of either of them. Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile) : … { … } /// Lexer constructor - Create a new raw lexer object. This object is only /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text /// range will outlive it, so it doesn't take ownership of it. /// This overload takes the text range as explicit \p BufStart, \p BufPtr and /// \p BufEnd pointers. Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, const char *BufStart, const char *BufPtr, const char *BufEnd, bool IsFirstIncludeOfFile) : … { … } /// Lexer constructor - Create a new raw lexer object. This object is only /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text /// range will outlive it, so it doesn't take ownership of it. /// This overload takes the text as the memory buffer \p FromFile, together /// with its \p FID and the SourceManager \p SM. Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, const SourceManager &SM, const LangOptions &langOpts, bool IsFirstIncludeOfFile) : … { … } void Lexer::resetExtendedTokenMode() { … } /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for /// _Pragma expansion. 
This has a variety of magic semantics that this method /// sets up. It returns a new'd Lexer that must be delete'd when done. /// /// On entrance to this routine, TokStartLoc is a macro location which has a /// spelling loc that indicates the bytes to be lexed for the token and an /// expansion location that indicates where all lexed tokens should be /// "expanded from". /// NOTE(review): no parameter is named TokStartLoc; the signature takes /// \p SpellingLoc, \p ExpansionLocStart and \p ExpansionLocEnd instead, so /// the paragraph above looks stale -- confirm. /// /// TODO: It would really be nice to make _Pragma just be a wrapper around a /// normal lexer that remaps tokens as they fly by. This would require making /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer /// interface that could handle this stuff. This would pull GetMappedTokenLoc /// out of the critical path of the lexer! /// Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP) { … } void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { … } template <typename T> static void StringifyImpl(T &Str, char Quote) { … } std::string Lexer::Stringify(StringRef Str, bool Charify) { … } void Lexer::Stringify(SmallVectorImpl<char> &Str) { … } //===----------------------------------------------------------------------===// // Token Spelling //===----------------------------------------------------------------------===// /// Slow case of getSpelling. Extract the characters comprising the /// spelling of this token from the provided input buffer. static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling) { … } /// getSpelling() - Return the 'spelling' of this token. The spelling of a /// token is the sequence of characters used to represent the token in the /// source file after trigraph expansion and escaped-newline folding. In /// particular, this wants to get the true, uncanonicalized, spelling of /// things like digraphs, UCNs, etc. 
StringRef Lexer::getSpelling(SourceLocation loc, SmallVectorImpl<char> &buffer, const SourceManager &SM, const LangOptions &options, bool *invalid) { … } /// getSpelling() - Return the 'spelling' of this token. The spelling of a /// token is the sequence of characters used to represent the token in the /// source file after trigraph expansion and escaped-newline folding. In /// particular, this wants to get the true, uncanonicalized, spelling of /// things like digraphs, UCNs, etc. std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { … } /// getSpelling - This method is used to get the spelling of a token into a /// preallocated buffer, instead of as a std::string. The caller is required /// to allocate enough space for the token, which is guaranteed to be at least /// Tok.getLength() bytes long. The actual length of the token is returned. /// /// Note that this method may do two possible things: it may either fill in /// the buffer specified with characters, or it may *change the input pointer* /// to point to a constant buffer with the data already in it (avoiding a /// copy). The caller is not allowed to modify the returned buffer pointer /// if an internal buffer is returned. unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { … } /// MeasureTokenLength - Relex the token at the specified location and return /// its length in bytes in the input file. If the token needs cleaning (e.g. /// includes a trigraph or an escaped newline) then this count includes bytes /// that are part of that. unsigned Lexer::MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } /// Relex the token at the specified location. /// \returns true if there was a failure, false on success. 
bool Lexer::getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace) { … } /// Returns the pointer that points to the beginning of the line that contains /// the given offset, or null if the offset is invalid. static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { … } static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } namespace { enum PreambleDirectiveKind { … }; } // namespace PreambleBounds Lexer::ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines) { … } unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts) { … } /// Computes the source location just past the end of the /// token at this source location. /// /// This routine can be used to produce a source location that /// points just past the end of the token referenced by \p Loc, and /// is generally used when a diagnostic needs to point just after a /// token where it expected something different than it received. If /// the returned source location would not be meaningful (e.g., if /// it points into a macro), this routine returns an invalid /// source location. /// /// \param Offset an offset from the end of the token, where the source /// location should refer to. The default offset (0) produces a source /// location pointing just past the end of the token; an offset of 1 produces /// a source location pointing to the last character in the token, etc. SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts) { … } /// Returns true if the given MacroID location points at the first /// token of the macro expansion. 
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin) { … } /// Returns true if the given MacroID location points at the last /// token of the macro expansion. /// NOTE(review): \p MacroEnd looks like an optional out-parameter for the /// end of the expansion -- confirm against the body, which is not visible here. bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd) { … } static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts) { … } // Assumes that `Loc` is in an expansion. static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM) { … } CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts) { … } /// NOTE(review): presumably returns the source characters covered by /// \p Range and reports failure through \p Invalid -- confirm against the body. StringRef Lexer::getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid) { … } StringRef Lexer::getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } StringRef Lexer::getImmediateMacroNameForDiagnostics( SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { … } bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { … } StringRef Lexer::getIndentationForLine(SourceLocation Loc, const SourceManager &SM) { … } //===----------------------------------------------------------------------===// // Diagnostics forwarding code. //===----------------------------------------------------------------------===// /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the /// lexer buffer was all expanded at a single point, perform the mapping. /// This is currently only used for _Pragma implementation, so it is the slow /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); static SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen) { … } /// getSourceLocation - Return a source location identifier for the specified /// offset in the current file. SourceLocation Lexer::getSourceLocation(const char *Loc, unsigned TokLen) const { … } /// Diag - Forwarding function for diagnostics. This translates a source /// position in the current buffer into a SourceLocation object for rendering. DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { … } //===----------------------------------------------------------------------===// // Trigraph and Escaped Newline Handling Code. //===----------------------------------------------------------------------===// /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. static char GetTrigraphCharForLetter(char Letter) { … } /// DecodeTrigraphChar - If the specified character is a legal trigraph when /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, /// return the result character. Finally, emit a warning about trigraph use /// whether trigraphs are enabled or not. static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { … } /// getEscapedNewLineSize - Return the size of the specified escaped newline, /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a /// trigraph equivalent on entry to this function. unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { … } /// SkipEscapedNewLines - If P points to an escaped newline (or a series of /// them), skip over them and return the first non-escaped-newline found, /// otherwise return P. 
const char *Lexer::SkipEscapedNewLines(const char *P) { … } /// NOTE(review): presumably returns the token that follows \p Loc, or /// std::nullopt when no such token can be produced -- confirm against the body. std::optional<Token> Lexer::findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { … } /// Checks that the given token is the first token that occurs after the /// given location (this excludes comments and whitespace). Returns the location /// immediately after the specified token. If the token is not found or the /// location is inside a macro, the returned source location will be invalid. SourceLocation Lexer::findLocationAfterToken( SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { … } /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, /// get its size, and return it. This is tricky in several cases: /// 1. If currently at the start of a trigraph, we warn about the trigraph, /// then either return the trigraph (skipping 3 chars) or the '?', /// depending on whether trigraphs are enabled or not. /// 2. If this is an escaped newline (potentially with whitespace between /// the backslash and newline), implicitly skip the newline and return /// the char after it. /// /// This handles the slow/uncommon case of the getCharAndSize method. Here we /// know that we can accumulate into Size, and that we have already incremented /// Ptr by Size bytes. /// NOTE(review): the signature has no Size parameter; the size is presumably /// carried in the returned Lexer::SizedChar -- confirm. /// /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should /// be updated to match. Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) { … } /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, /// and that we have already incremented Ptr by Size bytes. /// /// NOTE: When this method is updated, getCharAndSizeSlow (above) should /// be updated to match. 
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, const LangOptions &LangOpts) { … } //===----------------------------------------------------------------------===// // Helper methods for lexing. //===----------------------------------------------------------------------===// /// Routine that indiscriminately sets the offset into the source file. void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { … } static bool isUnicodeWhitespace(uint32_t Codepoint) { … } static llvm::SmallString<5> codepointAsHexString(uint32_t C) { … } // To mitigate https://github.com/llvm/llvm-project/issues/54732, // we allow "Mathematical Notation Characters" in identifiers. // This is a proposed profile that extends the XID_Start/XID_Continue // with mathematical symbols, superscripts and subscript digits // found in some production software. // https://www.unicode.org/L2/L2022/22230-math-profile.pdf static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension) { … } static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension) { … } static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension) { … } static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range) { … } static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End) { … } static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst) { … } /// After encountering UTF-8 character C and interpreting it as an identifier /// character, check whether it's a homoglyph for a common non-identifier /// source character that is unlikely to be an intentional identifier /// character and warn if so. 
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range) { … } static void diagnoseInvalidUnicodeCodepointInIdentifier( DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst) { … } bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, Token &Result) { … } bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) { … } bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr) { … } static const char * fastParseASCIIIdentifier(const char *CurPtr, [[maybe_unused]] const char *BufferEnd) { … } bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { … } /// isHexaLiteral - Return true if Start points to a hex constant /// in Microsoft mode (where this is supposed to be several different tokens). bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { … } /// LexNumericConstant - Lex the remainder of an integer or floating point /// constant. From[-1] is the first character lexed. Return the end of the /// constant. /// NOTE(review): the signature has no parameter named From (the pointer is /// \p CurPtr) and returns bool rather than the end of the constant, so the /// wording above looks stale -- confirm against the body. bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { … } /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes /// in C++11, or warn on a ud-suffix in C++98. const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, bool IsStringLiteral) { … } /// LexStringLiteral - Lex the remainder of a string literal, after having lexed /// either " or L" or u8" or u" or U". bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, tok::TokenKind Kind) { … } /// LexRawStringLiteral - Lex the remainder of a raw string literal, after /// having lexed R", LR", u8R", uR", or UR". bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, tok::TokenKind Kind) { … } /// LexAngledStringLiteral - Lex the remainder of an angled string literal, /// after having lexed the '<' character. This is used for #include filenames. 
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { … } void Lexer::codeCompleteIncludedFile(const char *PathStart, const char *CompletionPoint, bool IsAngled) { … } /// LexCharConstant - Lex the remainder of a character constant, after having /// lexed either ' or L' or u8' or u' or U'. bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, tok::TokenKind Kind) { … } /// SkipWhitespace - Efficiently skip over a series of whitespace characters. /// Update BufferPtr to point to the next non-whitespace character and return. /// /// This method forms a token and returns true if KeepWhitespaceMode is enabled. bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, bool &TokAtPhysicalStartOfLine) { … } /// We have just read the // characters from input. Skip until we find the /// newline character that terminates the comment. Then update BufferPtr and /// return. /// /// If we're in KeepCommentMode or any CommentHandler has inserted /// some tokens, this will store the first token and return true. bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, bool &TokAtPhysicalStartOfLine) { … } /// If in save-comment mode, package up this Line comment in an appropriate /// way and return it. bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { … } /// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline /// character (either \\n or \\r) is part of an escaped newline sequence. Issue /// a diagnostic if so. We know that the newline is inside of a block comment. static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs) { … } #ifdef __SSE2__ #include <emmintrin.h> #elif __ALTIVEC__ #include <altivec.h> #undef bool #endif /// We have just read from input the / and * characters that started a comment. /// Read until we find the * and / characters that terminate the comment. 
/// Note that we don't bother decoding trigraphs or escaped newlines in block /// comments, because they cannot cause the comment to end. The only thing /// that can happen is the comment could end with an escaped newline between /// the terminating * and /. /// /// If we're in KeepCommentMode or any CommentHandler has inserted /// some tokens, this will store the first token and return true. bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, bool &TokAtPhysicalStartOfLine) { … } //===----------------------------------------------------------------------===// // Primary Lexing Entry Points //===----------------------------------------------------------------------===// /// ReadToEndOfLine - Read the rest of the current preprocessor line as an /// uninterpreted string. This switches the lexer out of directive mode. void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { … } /// LexEndOfFile - CurPtr points to the end of this file. Handle this /// condition, reporting diagnostics and handling other edge cases as required. /// This returns true if Result contains a token, false if PP.Lex should be /// called again. bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { … } /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from /// the specified lexer will be a tok::l_paren token, 0 if it is something /// else and 2 if there are no more tokens in the buffer controlled by the /// lexer. unsigned Lexer::isNextPPTokenLParen() { … } /// Find the end of a version control conflict marker. static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK) { … } /// IsStartOfConflictMarker - If the specified pointer is the start of a version /// control conflict marker like '<<<<<<<', recognize it as such, emit an error /// and recover nicely. This returns true if it is a conflict marker and false /// if not. 
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { … } /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it /// is the end of a conflict marker. Handle it by ignoring up until the end of /// the line. This returns true if it is a conflict marker and false if not. bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { … } static const char *findPlaceholderEnd(const char *CurPtr, const char *BufferEnd) { … } bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { … } bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { … } std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result) { … } std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, const char *SlashLoc, Token *Result) { … } uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result) { … } bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr) { … } void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { … } bool Lexer::Lex(Token &Result) { … } /// LexTokenInternal - This implements a simple C family lexer. It is an /// extremely performance critical piece of code. This assumes that the buffer /// has a null character at the end of the file. This returns a preprocessing /// token, not a normal token; as such, it is an internal interface. It assumes /// that the Flags of result have been cleared before calling this. bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { … } const char *Lexer::convertDependencyDirectiveToken( const dependency_directives_scan::Token &DDTok, Token &Result) { … } bool Lexer::LexDependencyDirectiveToken(Token &Result) { … } bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { … }