SourceCode.cpp | Explore in Territory

//===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "SourceCode.h"

#include "FuzzyMatch.h"
#include "Preamble.h"
#include "Protocol.h"
#include "support/Context.h"
#include "support/Logger.h"
#include "clang/Basic/FileEntry.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Driver/Types.h"
#include "clang/Format/Format.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/Token.h"
#include "clang/Tooling/Core/Replacement.h"
#include "clang/Tooling/Syntax/Tokens.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

namespace clang {
namespace clangd {

// Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
// Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.

// Iterates over unicode codepoints in the (UTF-8) string. For each,
// invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
// Returns true if CB returned true, false if we hit the end of string.
//
// If the string is not valid UTF-8, we log this error and "decode" the
// text in some arbitrary way. This is pretty sad, but this tends to happen deep
// within indexing of headers where clang misdetected the encoding, and
// propagating the error all the way back up is (probably?) not be worth it.
template <typename Callback>
static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { … }

// Returns the byte offset into the string that is an offset of \p Units in
// the specified encoding.
// Conceptually, this converts to the encoding, truncates to CodeUnits,
// converts back to UTF-8, and returns the length in bytes.
static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
                           bool &Valid) { … }

Key<OffsetEncoding> kCurrentOffsetEncoding;
static OffsetEncoding lspEncoding() { … }

// Like most strings in clangd, the input is UTF-8 encoded.
size_t lspLength(llvm::StringRef Code) { … }

llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
                                        bool AllowColumnsBeyondLineLength) { … }

Position offsetToPosition(llvm::StringRef Code, size_t Offset) { … }

Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { … }

bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { … }

bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { … }

SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM) { … }

static unsigned getTokenLengthAtLoc(SourceLocation Loc, const SourceManager &SM,
                                    const LangOptions &LangOpts) { … }

// Returns location of the last character of the token at a given loc
static SourceLocation getLocForTokenEnd(SourceLocation BeginLoc,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts) { … }

// Returns location of the starting of the token at a given EndLoc
static SourceLocation getLocForTokenBegin(SourceLocation EndLoc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) { … }

// Converts a char source range to a token range.
static SourceRange toTokenRange(CharSourceRange Range, const SourceManager &SM,
                                const LangOptions &LangOpts) { … }
// Returns the union of two token ranges.
// To find the maximum of the Ends of the ranges, we compare the location of the
// last character of the token.
static SourceRange unionTokenRange(SourceRange R1, SourceRange R2,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) { … }

// Given a range whose endpoints may be in different expansions or files,
// tries to find a range within a common file by following up the expansion and
// include location in each.
static SourceRange rangeInCommonFile(SourceRange R, const SourceManager &SM,
                                     const LangOptions &LangOpts) { … }

// Find an expansion range (not necessarily immediate) the ends of which are in
// the same file id.
static SourceRange
getExpansionTokenRangeInSameFile(SourceLocation Loc, const SourceManager &SM,
                                 const LangOptions &LangOpts) { … }

// Returns the file range for a given Location as a Token Range
// This is quite similar to getFileLoc in SourceManager as both use
// getImmediateExpansionRange and getImmediateSpellingLoc (for macro IDs).
// However:
// - We want to maintain the full range information as we move from one file to
//   the next. getFileLoc only uses the BeginLoc of getImmediateExpansionRange.
// - We want to split '>>' tokens as the lexer parses the '>>' in nested
//   template instantiations as a '>>' instead of two '>'s.
// There is also getExpansionRange but it simply calls
// getImmediateExpansionRange on the begin and ends separately which is wrong.
static SourceRange getTokenFileRange(SourceLocation Loc,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) { … }

bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM) { … }

std::optional<SourceRange> toHalfOpenFileRange(const SourceManager &SM,
                                               const LangOptions &LangOpts,
                                               SourceRange R) { … }

llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { … }

llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM,
                                                        Position P) { … }

Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { … }

void unionRanges(Range &A, Range B) { … }

std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code,
                                                  size_t Offset) { … }

std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { … }

TextEdit replacementToEdit(llvm::StringRef Code,
                           const tooling::Replacement &R) { … }

std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code,
                                          const tooling::Replacements &Repls) { … }

std::optional<std::string> getCanonicalPath(const FileEntryRef F,
                                            FileManager &FileMgr) { … }

TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M,
                    const LangOptions &L) { … }

FileDigest digest(llvm::StringRef Content) { … }

std::optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { … }

format::FormatStyle getFormatStyleForFile(llvm::StringRef File,
                                          llvm::StringRef Content,
                                          const ThreadsafeFS &TFS,
                                          bool FormatFile) { … }

llvm::Expected<tooling::Replacements>
cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces,
                 const format::FormatStyle &Style) { … }

static void
lex(llvm::StringRef Code, const LangOptions &LangOpts,
    llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)>
        Action) { … }

llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content,
                                             const format::FormatStyle &Style) { … }

std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier,
                                           llvm::StringRef Content,
                                           const LangOptions &LangOpts) { … }

bool isKeyword(llvm::StringRef NewName, const LangOptions &LangOpts) { … }

namespace {
struct NamespaceEvent { … };
// Scans C++ source code for constructs that change the visible namespaces.
void parseNamespaceEvents(llvm::StringRef Code, const LangOptions &LangOpts,
                          llvm::function_ref<void(NamespaceEvent)> Callback) { … }

// Returns the prefix namespaces of NS: {"" ... NS}.
llvm::SmallVector<llvm::StringRef> ancestorNamespaces(llvm::StringRef NS) { … }

// Checks whether \p FileName is a valid spelling of main file.
bool isMainFile(llvm::StringRef FileName, const SourceManager &SM) { … }

} // namespace

std::vector<std::string> visibleNamespaces(llvm::StringRef Code,
                                           const LangOptions &LangOpts) { … }

llvm::StringSet<> collectWords(llvm::StringRef Content) { … }

static bool isLikelyIdentifier(llvm::StringRef Word, llvm::StringRef Before,
                               llvm::StringRef After) { … }

std::optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc,
                                                 const syntax::TokenBuffer &TB,
                                                 const LangOptions &LangOpts) { … }

std::optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok,
                                          Preprocessor &PP) { … }

llvm::Expected<std::string> Edit::apply() const { … }

std::vector<TextEdit> Edit::asTextEdits() const { … }

bool Edit::canApplyTo(llvm::StringRef Code) const { … }

llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { … }

// Workaround for editors that have buggy handling of newlines at end of file.
//
// The editor is supposed to expose document contents over LSP as an exact
// string, with whitespace and newlines well-defined. But internally many
// editors treat text as an array of lines, and there can be ambiguity over
// whether the last line ends with a newline or not.
//
// This confusion can lead to incorrect edits being sent. Failing to apply them
// is catastrophic: we're desynced, LSP has no mechanism to get back in sync.
// We apply a heuristic to avoid this state.
//
// If our current view of an N-line file does *not* end in a newline, but the
// editor refers to the start of the next line (an impossible location), then
// we silently add a newline to make this valid.
// We will still validate that the rangeLength is correct, *including* the
// inferred newline.
//
// See https://github.com/neovim/neovim/issues/17085
static void inferFinalNewline(llvm::Expected<size_t> &Err,
                              std::string &Contents, const Position &Pos) { … }

llvm::Error applyChange(std::string &Contents,
                        const TextDocumentContentChangeEvent &Change) { … }

EligibleRegion getEligiblePoints(llvm::StringRef Code,
                                 llvm::StringRef FullyQualifiedName,
                                 const LangOptions &LangOpts) { … }

bool isHeaderFile(llvm::StringRef FileName,
                  std::optional<LangOptions> LangOpts) { … }

bool isProtoFile(SourceLocation Loc, const SourceManager &SM) { … }

SourceLocation translatePreamblePatchLocation(SourceLocation Loc,
                                              const SourceManager &SM) { … }

clangd::Range rangeTillEOL(llvm::StringRef Code, unsigned HashOffset) { … }
} // namespace clangd
} // namespace clang
llvm/clang-tools-extra/clangd/SourceCode.cpp