InputFiles.cpp | Explore in Territory

//===- InputFiles.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to parse Mach-O object files. In this comment,
// we describe the Mach-O file structure and how we parse it.
//
// Mach-O is not very different from ELF or COFF. The notion of symbols,
// sections and relocations exists in Mach-O as it does in ELF and COFF.
//
// Perhaps the notion that is new to those who know ELF/COFF is "subsections".
// In ELF/COFF, sections are an atomic unit of data copied from input files to
// output files. When we merge or garbage-collect sections, we treat each
// section as an atomic unit. In Mach-O, that's not the case. Sections can
// consist of multiple subsections, and subsections are a unit of merging and
// garbage-collecting. Therefore, Mach-O's subsections are more similar to
// ELF/COFF's sections than Mach-O's sections are.
//
// A section can have multiple symbols. A symbol that does not have the
// N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by
// definition, a symbol is always present at the beginning of each subsection. A
// symbol with N_ALT_ENTRY attribute does not start a new subsection and can
// point to a middle of a subsection.
//
// The notion of subsections also affects how relocations are represented in
// Mach-O. All references within a section need to be explicitly represented as
// relocations if they refer to different subsections, because we obviously need
// to fix up addresses if subsections are laid out in an output file differently
// than they were in object files. To represent that, Mach-O relocations can
// refer to an unnamed location via its address. Scattered relocations (those
// with the R_SCATTERED bit set) always refer to unnamed locations.
// Non-scattered relocations refer to an unnamed location if r_extern is not set
// and r_symbolnum is zero.
//
// Without the above differences, I think you can use your knowledge about ELF
// and COFF for Mach-O.
//
//===----------------------------------------------------------------------===//

#include "InputFiles.h"
#include "Config.h"
#include "Driver.h"
#include "Dwarf.h"
#include "EhFrame.h"
#include "ExportTrie.h"
#include "InputSection.h"
#include "MachOStructs.h"
#include "ObjC.h"
#include "OutputSection.h"
#include "OutputSegment.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"

#include "lld/Common/CommonLinkerContext.h"
#include "lld/Common/DWARF.h"
#include "lld/Common/Reproduce.h"
#include "llvm/ADT/iterator.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/TextAPI/Architecture.h"
#include "llvm/TextAPI/InterfaceFile.h"

#include <optional>
#include <type_traits>

usingnamespacellvm;
usingnamespacellvm::MachO;
usingnamespacellvm::support::endian;
usingnamespacellvm::sys;
usingnamespacelld;
usingnamespacelld::macho;

// Returns "<internal>", "foo.a(bar.o)", or "baz.o".
std::string lld::toString(const InputFile *f) { … }

std::string lld::toString(const Section &sec) { … }

SetVector<InputFile *> macho::inputFiles;
std::unique_ptr<TarWriter> macho::tar;
int InputFile::idCount = …;

static VersionTuple decodeVersion(uint32_t version) { … }

static std::vector<PlatformInfo> getPlatformInfos(const InputFile *input) { … }

static bool checkCompatibility(const InputFile *input) { … }

template <class Header>
static bool compatWithTargetArch(const InputFile *file, const Header *hdr) { … }

// This cache mostly exists to store system libraries (and .tbds) as they're
// loaded, rather than the input archives, which are already cached at a higher
// level, and other files like the filelist that are only read once.
// Theoretically this caching could be more efficient by hoisting it, but that
// would require altering many callers to track the state.
DenseMap<CachedHashStringRef, MemoryBufferRef> macho::cachedReads;
// Open a given file path and return it as a memory-mapped file.
std::optional<MemoryBufferRef> macho::readFile(StringRef path) { … }

InputFile::InputFile(Kind kind, const InterfaceFile &interface)
    : … { … }

// Some sections comprise of fixed-size records, so instead of splitting them at
// symbol boundaries, we split them based on size. Records are distinct from
// literals in that they may contain references to other sections, instead of
// being leaf nodes in the InputSection graph.
//
// Note that "record" is a term I came up with. In contrast, "literal" is a term
// used by the Mach-O format.
static std::optional<size_t> getRecordSize(StringRef segname, StringRef name) { … }

static Error parseCallGraph(ArrayRef<uint8_t> data,
                            std::vector<CallGraphEntry> &callGraph) { … }

// Parse the sequence of sections within a single LC_SEGMENT(_64).
// Split each section into subsections.
template <class SectionHeader>
void ObjFile::parseSections(ArrayRef<SectionHeader> sectionHeaders) { … }

void ObjFile::splitEhFrames(ArrayRef<uint8_t> data, Section &ehFrameSection) { … }

template <class T>
static Section *findContainingSection(const std::vector<Section *> &sections,
                                      T *offset) { … }

// Find the subsection corresponding to the greatest section offset that is <=
// that of the given offset.
//
// offset: an offset relative to the start of the original InputSection (before
// any subsection splitting has occurred). It will be updated to represent the
// same location as an offset relative to the start of the containing
// subsection.
template <class T>
static InputSection *findContainingSubsection(const Section &section,
                                              T *offset) { … }

// Find a symbol at offset `off` within `isec`.
static Defined *findSymbolAtOffset(const ConcatInputSection *isec,
                                   uint64_t off) { … }

template <class SectionHeader>
static bool validateRelocationInfo(InputFile *file, const SectionHeader &sec,
                                   relocation_info rel) { … }

template <class SectionHeader>
void ObjFile::parseRelocations(ArrayRef<SectionHeader> sectionHeaders,
                               const SectionHeader &sec, Section &section) { … }

template <class NList>
static macho::Symbol *createDefined(const NList &sym, StringRef name,
                                    InputSection *isec, uint64_t value,
                                    uint64_t size, bool forceHidden) { … }

// Absolute symbols are defined symbols that do not have an associated
// InputSection. They cannot be weak.
template <class NList>
static macho::Symbol *createAbsolute(const NList &sym, InputFile *file,
                                     StringRef name, bool forceHidden) { … }

template <class NList>
macho::Symbol *ObjFile::parseNonSectionSymbol(const NList &sym,
                                              const char *strtab) { … }

template <class NList> static bool isUndef(const NList &sym) { … }

template <class LP>
void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders,
                           ArrayRef<typename LP::nlist> nList,
                           const char *strtab, bool subsectionsViaSymbols) { … }

OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName,
                       StringRef sectName)
    : … { … }

template <class LP>
void ObjFile::parseLinkerOptions(SmallVectorImpl<StringRef> &LCLinkerOptions) { … }

SmallVector<StringRef> macho::unprocessedLCLinkerOptions;
ObjFile::ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName,
                 bool lazy, bool forceHidden, bool compatArch,
                 bool builtFromBitcode)
    : … { … }

template <class LP> void ObjFile::parse() { … }

template <class LP> void ObjFile::parseLazy() { … }

void ObjFile::parseDebugInfo() { … }

ArrayRef<data_in_code_entry> ObjFile::getDataInCode() const { … }

ArrayRef<uint8_t> ObjFile::getOptimizationHints() const { … }

// Create pointers from symbols to their associated compact unwind entries.
void ObjFile::registerCompactUnwind(Section &compactUnwindSection) { … }

struct CIE { … };

static uint8_t pointerEncodingToSize(uint8_t enc) { … }

static CIE parseCIE(const InputSection *isec, const EhReader &reader,
                    size_t off) { … }

// EH frame target addresses may be encoded as pcrel offsets. However, instead
// of using an actual pcrel reloc, ld64 emits subtractor relocations instead.
// This function recovers the target address from the subtractors, essentially
// performing the inverse operation of EhRelocator.
//
// Concretely, we expect our relocations to write the value of `PC -
// target_addr` to `PC`. `PC` itself is denoted by a minuend relocation that
// points to a symbol plus an addend.
//
// It is important that the minuend relocation point to a symbol within the
// same section as the fixup value, since sections may get moved around.
//
// For example, for arm64, llvm-mc emits relocations for the target function
// address like so:
//
//   ltmp:
//     <CIE start>
//     ...
//     <CIE end>
//     ... multiple FDEs ...
//     <FDE start>
//     <target function address - (ltmp + pcrel offset)>
//     ...
//
// If any of the FDEs in `multiple FDEs` get dead-stripped, then `FDE start`
// will move to an earlier address, and `ltmp + pcrel offset` will no longer
// reflect an accurate pcrel value. To avoid this problem, we "canonicalize"
// our relocation by adding an `EH_Frame` symbol at `FDE start`, and updating
// the reloc to be `target function address - (EH_Frame + new pcrel offset)`.
//
// If `Invert` is set, then we instead expect `target_addr - PC` to be written
// to `PC`.
template <bool Invert = false>
Defined *
targetSymFromCanonicalSubtractor(const InputSection *isec,
                                 std::vector<macho::Reloc>::iterator relocIt) { … }

Defined *findSymbolAtAddress(const std::vector<Section *> &sections,
                             uint64_t addr) { … }

// For symbols that don't have compact unwind info, associate them with the more
// general-purpose (and verbose) DWARF unwind info found in __eh_frame.
//
// This requires us to parse the contents of __eh_frame. See EhFrame.h for a
// description of its format.
//
// While parsing, we also look for what MC calls "abs-ified" relocations -- they
// are relocations which are implicitly encoded as offsets in the section data.
// We convert them into explicit Reloc structs so that the EH frames can be
// handled just like a regular ConcatInputSection later in our output phase.
//
// We also need to handle the case where our input object file has explicit
// relocations. This is the case when e.g. it's the output of `ld -r`. We only
// look for the "abs-ified" relocation if an explicit relocation is absent.
void ObjFile::registerEhFrames(Section &ehFrameSection) { … }

std::string ObjFile::sourceFile() const { … }

lld::DWARFCache *ObjFile::getDwarf() { … }
// The path can point to either a dylib or a .tbd file.
static DylibFile *loadDylib(StringRef path, DylibFile *umbrella) { … }

// TBD files are parsed into a series of TAPI documents (InterfaceFiles), with
// the first document storing child pointers to the rest of them. When we are
// processing a given TBD file, we store that top-level document in
// currentTopLevelTapi. When processing re-exports, we search its children for
// potentially matching documents in the same TBD file. Note that the children
// themselves don't point to further documents, i.e. this is a two-level tree.
//
// Re-exports can either refer to on-disk files, or to documents within .tbd
// files.
static DylibFile *findDylib(StringRef path, DylibFile *umbrella,
                            const InterfaceFile *currentTopLevelTapi) { … }

// If a re-exported dylib is public (lives in /usr/lib or
// /System/Library/Frameworks), then it is considered implicitly linked: we
// should bind to its symbols directly instead of via the re-exporting umbrella
// library.
static bool isImplicitlyLinked(StringRef path) { … }

void DylibFile::loadReexport(StringRef path, DylibFile *umbrella,
                         const InterfaceFile *currentTopLevelTapi) { … }

DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella,
                     bool isBundleLoader, bool explicitlyLinked)
    : … { … }

void DylibFile::parseExportedSymbols(uint32_t offset, uint32_t size) { … }

void DylibFile::parseLoadCommands(MemoryBufferRef mb) { … }

// Some versions of Xcode ship with .tbd files that don't have the right
// platform settings.
constexpr std::array<StringRef, 3> skipPlatformChecks{ … };

static bool skipPlatformCheckForCatalyst(const InterfaceFile &interface,
                                         bool explicitlyLinked) { … }

static bool isArchABICompatible(ArchitectureSet archSet,
                                Architecture targetArch) { … }

static bool isTargetPlatformArchCompatible(
    InterfaceFile::const_target_range interfaceTargets, Target target) { … }

DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella,
                     bool isBundleLoader, bool explicitlyLinked)
    : … { … }

DylibFile::DylibFile(DylibFile *umbrella)
    : … { … }

void DylibFile::parseReexports(const InterfaceFile &interface) { … }

bool DylibFile::isExplicitlyLinked() const { … }

DylibFile *DylibFile::getSyntheticDylib(StringRef installName,
                                        uint32_t currentVersion,
                                        uint32_t compatVersion) { … }

// $ld$ symbols modify the properties/behavior of the library (e.g. its install
// name, compatibility version or hide/add symbols) for specific target
// versions.
bool DylibFile::handleLDSymbol(StringRef originalName) { … }

void DylibFile::handleLDPreviousSymbol(StringRef name, StringRef originalName) { … }

void DylibFile::handleLDInstallNameSymbol(StringRef name,
                                          StringRef originalName) { … }

void DylibFile::handleLDHideSymbol(StringRef name, StringRef originalName) { … }

void DylibFile::checkAppExtensionSafety(bool dylibIsAppExtensionSafe) const { … }

ArchiveFile::ArchiveFile(std::unique_ptr<object::Archive> &&f, bool forceHidden)
    : … { … }

void ArchiveFile::addLazySymbols() { … }

static Expected<InputFile *>
loadArchiveMember(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName,
                  uint64_t offsetInArchive, bool forceHidden, bool compatArch) { … }

Error ArchiveFile::fetch(const object::Archive::Child &c, StringRef reason) { … }

void ArchiveFile::fetch(const object::Archive::Symbol &sym) { … }

static macho::Symbol *createBitcodeSymbol(const lto::InputFile::Symbol &objSym,
                                          BitcodeFile &file) { … }

BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
                         uint64_t offsetInArchive, bool lazy, bool forceHidden,
                         bool compatArch)
    : … { … }

void BitcodeFile::parse() { … }

void BitcodeFile::parseLazy() { … }

std::string macho::replaceThinLTOSuffix(StringRef path) { … }

void macho::extract(InputFile &file, StringRef reason) { … }

template void ObjFile::parse<LP64>();
llvm/lld/MachO/InputFiles.cpp