//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the NumericLiteralParser, CharLiteralParser, and // StringLiteralParser interfaces. // //===----------------------------------------------------------------------===// #include "clang/Lex/LiteralSupport.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/TargetInfo.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/Token.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Unicode.h" #include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <cstring> #include <string> usingnamespaceclang; static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { … } static unsigned getEncodingPrefixLen(tok::TokenKind kind) { … } static CharSourceRange MakeCharSourceRange(const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd) { … } /// Produce a diagnostic highlighting some portion of a literal. /// /// Emits the diagnostic \p DiagID, highlighting the range of characters from /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be /// a substring of a spelling buffer for the token beginning at \p TokBegin. 
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID) { … } static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) { … } /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. static unsigned ProcessCharEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, bool &HadError, FullSourceLoc Loc, unsigned CharWidth, DiagnosticsEngine *Diags, const LangOptions &Features, StringLiteralEvalMethod EvalMethod) { … } static void appendCodePoint(unsigned Codepoint, llvm::SmallVectorImpl<char> &Str) { … } void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { … } bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K, const LangOptions &LO) { … } bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) { … } static bool ProcessNumericUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, bool &Delimited, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features, bool in_char_string_literal = false) { … } static void DiagnoseInvalidUnicodeCharacterName( DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, llvm::StringRef Name) { … } static bool ProcessNamedUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features) { … } /// ProcessUCNEscape - Read the Universal Character Name, check constraints and /// return the UTF32. 
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features, bool in_char_string_literal = false) { … } /// MeasureUCNEscape - Determine the number of bytes within the resulting string /// which this UCN will occupy. static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, unsigned CharByteWidth, const LangOptions &Features, bool &HadError) { … } /// EncodeUCNEscape - Read the Universal Character Name, check constraints and /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of /// StringLiteralParser. When we decide to implement UCN's for identifiers, /// we will likely rework our support for UCN's. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, char *&ResultBuf, bool &HadError, FullSourceLoc Loc, unsigned CharByteWidth, DiagnosticsEngine *Diags, const LangOptions &Features) { … } /// integer-constant: [C99 6.4.4.1] /// decimal-constant integer-suffix /// octal-constant integer-suffix /// hexadecimal-constant integer-suffix /// binary-literal integer-suffix [GNU, C++1y] /// user-defined-integer-literal: [C++11 lex.ext] /// decimal-literal ud-suffix /// octal-literal ud-suffix /// hexadecimal-literal ud-suffix /// binary-literal ud-suffix [GNU, C++1y] /// decimal-constant: /// nonzero-digit /// decimal-constant digit /// octal-constant: /// 0 /// octal-constant octal-digit /// hexadecimal-constant: /// hexadecimal-prefix hexadecimal-digit /// hexadecimal-constant hexadecimal-digit /// hexadecimal-prefix: one of /// 0x 0X /// binary-literal: /// 0b binary-digit /// 0B binary-digit /// binary-literal binary-digit /// integer-suffix: /// unsigned-suffix [long-suffix] /// unsigned-suffix [long-long-suffix] /// long-suffix [unsigned-suffix] /// long-long-suffix [unsigned-suffix] /// nonzero-digit: /// 1 2 
3 4 5 6 7 8 9 /// octal-digit: /// 0 1 2 3 4 5 6 7 /// hexadecimal-digit: /// 0 1 2 3 4 5 6 7 8 9 /// a b c d e f /// A B C D E F /// binary-digit: /// 0 /// 1 /// unsigned-suffix: one of /// u U /// long-suffix: one of /// l L /// long-long-suffix: one of /// ll LL /// /// floating-constant: [C99 6.4.4.2] /// TODO: add rules... /// NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, SourceLocation TokLoc, const SourceManager &SM, const LangOptions &LangOpts, const TargetInfo &Target, DiagnosticsEngine &Diags) : … { … } /// ParseDecimalOrOctalCommon - This method is called for decimal or octal /// numbers. It issues an error for illegal digits, and handles floating point /// parsing. If it detects a floating point number, the radix is set to 10. void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){ … } /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved /// suffixes as ud-suffixes, because the diagnostic experience is better if we /// treat it as an invalid suffix. bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix) { … } void NumericLiteralParser::checkSeparator(SourceLocation TokLoc, const char *Pos, CheckSeparatorKind IsAfterDigits) { … } /// ParseNumberStartingWithZero - This method is called when the first character /// of the number is found to be a zero. This means it is either an octal /// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010'), or /// a floating point number (01239.123e4). Eat the prefix, determining the /// radix etc. void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { … } static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) { … } /// GetIntegerValue - Convert this numeric literal value to an APInt that /// matches Val's input width. If there is an overflow, set Val to the low bits /// of the result and return true. Otherwise, return false. 
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { … } llvm::APFloat::opStatus NumericLiteralParser::GetFloatValue(llvm::APFloat &Result, llvm::RoundingMode RM) { … } static inline bool IsExponentPart(char c, bool isHex) { … } bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { … } /// \verbatim /// user-defined-character-literal: [C++11 lex.ext] /// character-literal ud-suffix /// ud-suffix: /// identifier /// character-literal: [C++11 lex.ccon] /// ' c-char-sequence ' /// u' c-char-sequence ' /// U' c-char-sequence ' /// L' c-char-sequence ' /// u8' c-char-sequence ' [C++1z lex.ccon] /// c-char-sequence: /// c-char /// c-char-sequence c-char /// c-char: /// any member of the source character set except the single-quote ', /// backslash \, or new-line character /// escape-sequence /// universal-character-name /// escape-sequence: /// simple-escape-sequence /// octal-escape-sequence /// hexadecimal-escape-sequence /// simple-escape-sequence: /// one of \' \" \? 
\\ \a \b \f \n \r \t \v /// octal-escape-sequence: /// \ octal-digit /// \ octal-digit octal-digit /// \ octal-digit octal-digit octal-digit /// hexadecimal-escape-sequence: /// \x hexadecimal-digit /// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: [C++11 lex.charset] /// \u hex-quad /// \U hex-quad hex-quad /// hex-quad: /// hex-digit hex-digit hex-digit hex-digit /// \endverbatim /// CharLiteralParser::CharLiteralParser(const char *begin, const char *end, SourceLocation Loc, Preprocessor &PP, tok::TokenKind kind) { … } /// \verbatim /// string-literal: [C++0x lex.string] /// encoding-prefix " [s-char-sequence] " /// encoding-prefix R raw-string /// encoding-prefix: /// u8 /// u /// U /// L /// s-char-sequence: /// s-char /// s-char-sequence s-char /// s-char: /// any member of the source character set except the double-quote ", /// backslash \, or new-line character /// escape-sequence /// universal-character-name /// raw-string: /// " d-char-sequence ( r-char-sequence ) d-char-sequence " /// r-char-sequence: /// r-char /// r-char-sequence r-char /// r-char: /// any member of the source character set, except a right parenthesis ) /// followed by the initial d-char-sequence (which may be empty) /// followed by a double quote ". /// d-char-sequence: /// d-char /// d-char-sequence d-char /// d-char: /// any member of the basic source character set except: /// space, the left parenthesis (, the right parenthesis ), /// the backslash \, and the control characters representing horizontal /// tab, vertical tab, form feed, and newline. /// escape-sequence: [C++0x lex.ccon] /// simple-escape-sequence /// octal-escape-sequence /// hexadecimal-escape-sequence /// simple-escape-sequence: /// one of \' \" \? 
\\ \a \b \f \n \r \t \v /// octal-escape-sequence: /// \ octal-digit /// \ octal-digit octal-digit /// \ octal-digit octal-digit octal-digit /// hexadecimal-escape-sequence: /// \x hexadecimal-digit /// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: /// \u hex-quad /// \U hex-quad hex-quad /// hex-quad: /// hex-digit hex-digit hex-digit hex-digit /// \endverbatim /// StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, StringLiteralEvalMethod EvalMethod) : … { … } void StringLiteralParser::init(ArrayRef<Token> StringToks){ … } static const char *resyncUTF8(const char *Err, const char *End) { … } /// This function copies from Fragment, which is a sequence of bytes /// within Tok's contents (which begin at TokBegin) into ResultPtr. /// Performs widening for multi-byte characters. bool StringLiteralParser::CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment) { … } void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) { … } /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. This handles /// advancing over escape sequences in the string. unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, unsigned ByteNo) const { … } /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved /// suffixes as ud-suffixes, because the diagnostic experience is better if we /// treat it as an invalid suffix. bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix) { … }