LiteralSupport.cpp | Explore in Territory

//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the NumericLiteralParser, CharLiteralParser, and
// StringLiteralParser interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/LiteralSupport.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Unicode.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

usingnamespaceclang;

static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { … }

static unsigned getEncodingPrefixLen(tok::TokenKind kind) { … }

static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) { … }

/// Produce a diagnostic highlighting some portion of a literal.
///
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
                              const LangOptions &Features, FullSourceLoc TokLoc,
                              const char *TokBegin, const char *TokRangeBegin,
                              const char *TokRangeEnd, unsigned DiagID) { … }

static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) { … }

/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *ThisTokBegin,
                                  const char *&ThisTokBuf,
                                  const char *ThisTokEnd, bool &HadError,
                                  FullSourceLoc Loc, unsigned CharWidth,
                                  DiagnosticsEngine *Diags,
                                  const LangOptions &Features,
                                  StringLiteralEvalMethod EvalMethod) { … }

static void appendCodePoint(unsigned Codepoint,
                            llvm::SmallVectorImpl<char> &Str) { … }

void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { … }

bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
                                              const LangOptions &LO) { … }

bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) { … }

static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
                                    const char *&ThisTokBuf,
                                    const char *ThisTokEnd, uint32_t &UcnVal,
                                    unsigned short &UcnLen, bool &Delimited,
                                    FullSourceLoc Loc, DiagnosticsEngine *Diags,
                                    const LangOptions &Features,
                                    bool in_char_string_literal = false) { … }

static void DiagnoseInvalidUnicodeCharacterName(
    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
    llvm::StringRef Name) { … }

static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
                                  const char *&ThisTokBuf,
                                  const char *ThisTokEnd, uint32_t &UcnVal,
                                  unsigned short &UcnLen, FullSourceLoc Loc,
                                  DiagnosticsEngine *Diags,
                                  const LangOptions &Features) { … }

/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                             const char *ThisTokEnd, uint32_t &UcnVal,
                             unsigned short &UcnLen, FullSourceLoc Loc,
                             DiagnosticsEngine *Diags,
                             const LangOptions &Features,
                             bool in_char_string_literal = false) { … }

/// MeasureUCNEscape - Determine the number of bytes within the resulting string
/// which this UCN will occupy.
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                            const char *ThisTokEnd, unsigned CharByteWidth,
                            const LangOptions &Features, bool &HadError) { … }

/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
/// we will likely rework our support for UCN's.
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                            const char *ThisTokEnd,
                            char *&ResultBuf, bool &HadError,
                            FullSourceLoc Loc, unsigned CharByteWidth,
                            DiagnosticsEngine *Diags,
                            const LangOptions &Features) { … }

///       integer-constant: [C99 6.4.4.1]
///         decimal-constant integer-suffix
///         octal-constant integer-suffix
///         hexadecimal-constant integer-suffix
///         binary-literal integer-suffix [GNU, C++1y]
///       user-defined-integer-literal: [C++11 lex.ext]
///         decimal-literal ud-suffix
///         octal-literal ud-suffix
///         hexadecimal-literal ud-suffix
///         binary-literal ud-suffix [GNU, C++1y]
///       decimal-constant:
///         nonzero-digit
///         decimal-constant digit
///       octal-constant:
///         0
///         octal-constant octal-digit
///       hexadecimal-constant:
///         hexadecimal-prefix hexadecimal-digit
///         hexadecimal-constant hexadecimal-digit
///       hexadecimal-prefix: one of
///         0x 0X
///       binary-literal:
///         0b binary-digit
///         0B binary-digit
///         binary-literal binary-digit
///       integer-suffix:
///         unsigned-suffix [long-suffix]
///         unsigned-suffix [long-long-suffix]
///         long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-sufix]
///       nonzero-digit:
///         1 2 3 4 5 6 7 8 9
///       octal-digit:
///         0 1 2 3 4 5 6 7
///       hexadecimal-digit:
///         0 1 2 3 4 5 6 7 8 9
///         a b c d e f
///         A B C D E F
///       binary-digit:
///         0
///         1
///       unsigned-suffix: one of
///         u U
///       long-suffix: one of
///         l L
///       long-long-suffix: one of
///         ll LL
///
///       floating-constant: [C99 6.4.4.2]
///         TODO: add rules...
///
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
                                           SourceLocation TokLoc,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts,
                                           const TargetInfo &Target,
                                           DiagnosticsEngine &Diags)
    : … { … }

/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
/// numbers. It issues an error for illegal digits, and handles floating point
/// parsing. If it detects a floating point number, the radix is set to 10.
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){ … }

/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
/// treat it as an invalid suffix.
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
                                           StringRef Suffix) { … }

void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
                                          const char *Pos,
                                          CheckSeparatorKind IsAfterDigits) { … }

/// ParseNumberStartingWithZero - This method is called when the first character
/// of the number is found to be a zero.  This means it is either an octal
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
/// a floating point number (01239.123e4).  Eat the prefix, determining the
/// radix etc.
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { … }

static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) { … }

/// GetIntegerValue - Convert this numeric literal value to an APInt that
/// matches Val's input width.  If there is an overflow, set Val to the low bits
/// of the result and return true.  Otherwise, return false.
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { … }

llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
                                    llvm::RoundingMode RM) { … }

static inline bool IsExponentPart(char c, bool isHex) { … }

bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { … }

/// \verbatim
///       user-defined-character-literal: [C++11 lex.ext]
///         character-literal ud-suffix
///       ud-suffix:
///         identifier
///       character-literal: [C++11 lex.ccon]
///         ' c-char-sequence '
///         u' c-char-sequence '
///         U' c-char-sequence '
///         L' c-char-sequence '
///         u8' c-char-sequence ' [C++1z lex.ccon]
///       c-char-sequence:
///         c-char
///         c-char-sequence c-char
///       c-char:
///         any member of the source character set except the single-quote ',
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       escape-sequence:
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name: [C++11 lex.charset]
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) { … }

/// \verbatim
///       string-literal: [C++0x lex.string]
///         encoding-prefix " [s-char-sequence] "
///         encoding-prefix R raw-string
///       encoding-prefix:
///         u8
///         u
///         U
///         L
///       s-char-sequence:
///         s-char
///         s-char-sequence s-char
///       s-char:
///         any member of the source character set except the double-quote ",
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       raw-string:
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
///       r-char-sequence:
///         r-char
///         r-char-sequence r-char
///       r-char:
///         any member of the source character set, except a right parenthesis )
///           followed by the initial d-char-sequence (which may be empty)
///           followed by a double quote ".
///       d-char-sequence:
///         d-char
///         d-char-sequence d-char
///       d-char:
///         any member of the basic source character set except:
///           space, the left parenthesis (, the right parenthesis ),
///           the backslash \, and the control characters representing horizontal
///           tab, vertical tab, form feed, and newline.
///       escape-sequence: [C++0x lex.ccon]
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name:
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
                                         Preprocessor &PP,
                                         StringLiteralEvalMethod EvalMethod)
    : … { … }

void StringLiteralParser::init(ArrayRef<Token> StringToks){ … }

static const char *resyncUTF8(const char *Err, const char *End) { … }

/// This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
                                             const char *TokBegin,
                                             StringRef Fragment) { … }

void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) { … }

/// getOffsetOfStringByte - This function returns the offset of the
/// specified byte of the string data represented by Token.  This handles
/// advancing over escape sequences in the string.
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
                                                    unsigned ByteNo) const { … }

/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
/// treat it as an invalid suffix.
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
                                          StringRef Suffix) { … }
llvm/clang/lib/Lex/LiteralSupport.cpp