// llvm/lld/MachO/SyntheticSections.cpp

//===- SyntheticSections.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SyntheticSections.h"
#include "ConcatOutputSection.h"
#include "Config.h"
#include "ExportTrie.h"
#include "InputFiles.h"
#include "MachOStructs.h"
#include "ObjC.h"
#include "OutputSegment.h"
#include "SymbolTable.h"
#include "Symbols.h"

#include "lld/Common/CommonLinkerContext.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/xxhash.h"

#if defined(__APPLE__)
#include <sys/mman.h>

#define COMMON_DIGEST_FOR_OPENSSL
#include <CommonCrypto/CommonDigest.h>
#else
#include "llvm/Support/SHA256.h"
#endif

usingnamespacellvm;
usingnamespacellvm::MachO;
usingnamespacellvm::support;
usingnamespacellvm::support::endian;
usingnamespacelld;
usingnamespacelld::macho;

// Reads `len` bytes at data and writes the 32-byte SHA256 checksum to `output`.
static void sha256(const uint8_t *data, size_t len, uint8_t *output) {}

// Global registry of the synthetic sections for the current link.
// NOTE(review): presumably each SyntheticSection registers itself here on
// construction — the constructors in this skeleton are stubs, so this cannot
// be confirmed from the visible code.
InStruct macho::in;
std::vector<SyntheticSection *> macho::syntheticSections;

// NOTE(review): the constructors below were written as "…() :{}", which is
// ill-formed C++ (a mem-initializer-list cannot be empty). The dangling ":"
// has been removed so the file parses; the real member initialization was
// elided in this skeleton and must be restored — TODO.
SyntheticSection::SyntheticSection(const char *segname, const char *name) {}

// dyld3's MachOLoaded::getSlide() assumes that the __TEXT segment starts
// from the beginning of the file (i.e. the header).
MachHeaderSection::MachHeaderSection() {}

// Registers a load command to be emitted into the Mach-O header. Stub body.
void MachHeaderSection::addLoadCommand(LoadCommand *lc) {}

// Size of the header plus all registered load commands. Stub body.
uint64_t MachHeaderSection::getSize() const {}

static uint32_t cpuSubtype() {}

static bool hasWeakBinding() {}

static bool hasNonWeakDefinition() {}

// Serializes the mach_header and load commands into `buf`. Stub body.
void MachHeaderSection::writeTo(uint8_t *buf) const {}

PageZeroSection::PageZeroSection() {}

RebaseSection::RebaseSection() {}

namespace {
// Accumulator used while encoding rebase opcodes.
// NOTE(review): fields are elided in this skeleton; the real struct tracks
// the in-progress run of rebase locations — TODO confirm against upstream.
struct RebaseState {};
} // namespace

// Emits opcode(s) that advance the current rebase address by `incr` bytes.
// Stub body — implementation elided.
static void emitIncrement(uint64_t incr, raw_svector_ostream &os) {}

// Flushes the accumulated RebaseState as opcodes into `os`. Stub body.
static void flushRebase(const RebaseState &state, raw_svector_ostream &os) {}

// Rebases are communicated to dyld using a bytecode, whose opcodes cause the
// memory location at a specific address to be rebased and/or the address to be
// incremented.
//
// Opcode REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB is the most generic
// one, encoding a series of evenly spaced addresses. This algorithm works by
// splitting up the sorted list of addresses into such chunks. If the locations
// are consecutive or the sequence consists of a single location, flushRebase
// will use a smaller, more specialized encoding.
static void encodeRebases(const OutputSegment *seg,
                          MutableArrayRef<Location> locations,
                          raw_svector_ostream &os) {}

// Builds the rebase opcode stream for all collected locations. Stub body.
void RebaseSection::finalizeContents() {}

void RebaseSection::writeTo(uint8_t *buf) const {}

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
NonLazyPointerSectionBase::NonLazyPointerSectionBase(const char *segname,
                                                     const char *name) {}

// Records the (regular or weak) binding entries needed so dyld can write
// sym's address at isec+offset. Stub body.
void macho::addNonLazyBindingEntries(const Symbol *sym,
                                     const InputSection *isec, uint64_t offset,
                                     int64_t addend) {}

void NonLazyPointerSectionBase::addEntry(Symbol *sym) {}

// Writes a chained-fixup rebase record pointing at `targetVA`. Stub body.
void macho::writeChainedRebase(uint8_t *buf, uint64_t targetVA) {}

static void writeChainedBind(uint8_t *buf, const Symbol *sym, int64_t addend) {}

// Dispatches to writeChainedRebase/writeChainedBind as appropriate. Stub body.
void macho::writeChainedFixup(uint8_t *buf, const Symbol *sym, int64_t addend) {}

void NonLazyPointerSectionBase::writeTo(uint8_t *buf) const {}

GotSection::GotSection() {}

TlvPointerSection::TlvPointerSection() {}

BindingSection::BindingSection() {}

namespace {
// Current state of the bind-opcode "interpreter" (see comment below).
// NOTE(review): fields elided in this skeleton — TODO confirm upstream.
struct Binding {};
// One decoded bind opcode plus its operands, prior to serialization.
struct BindIR {};
} // namespace

// Encode a sequence of opcodes that tell dyld to write the address of symbol +
// addend at osec->addr + outSecOff.
//
// The bind opcode "interpreter" remembers the values of each binding field, so
// we only need to encode the differences between bindings. Hence the use of
// lastBinding.
static void encodeBinding(const OutputSection *osec, uint64_t outSecOff,
                          int64_t addend, Binding &lastBinding,
                          std::vector<BindIR> &opcodes) {}

// Peephole-combines adjacent opcodes into more compact forms. Stub body.
static void optimizeOpcodes(std::vector<BindIR> &opcodes) {}

// Serializes a single BindIR into the output stream. Stub body.
static void flushOpcodes(const BindIR &op, raw_svector_ostream &os) {}

static bool needsWeakBind(const Symbol &sym) {}

// Non-weak bindings need to have their dylib ordinal encoded as well.
static int16_t ordinalForDylibSymbol(const DylibSymbol &dysym) {}

static int16_t ordinalForSymbol(const Symbol &sym) {}

static void encodeDylibOrdinal(int16_t ordinal, raw_svector_ostream &os) {}

static void encodeWeakOverride(const Defined *defined,
                               raw_svector_ostream &os) {}

// Organize the bindings so we can encoded them with fewer opcodes.
//
// First, all bindings for a given symbol should be grouped together.
// BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM is the largest opcode (since it
// has an associated symbol string), so we only want to emit it once per symbol.
//
// Within each group, we sort the bindings by address. Since bindings are
// delta-encoded, sorting them allows for a more compact result. Note that
// sorting by address alone ensures that bindings for the same segment / section
// are located together, minimizing the number of times we have to emit
// BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB.
//
// Finally, we sort the symbols by the address of their first binding, again
// to facilitate the delta-encoding process.
template <class Sym>
std::vector<std::pair<const Sym *, std::vector<BindingEntry>>>
sortBindings(const BindingsMap<const Sym *> &bindingsMap) {}

// Emit bind opcodes, which are a stream of byte-sized opcodes that dyld
// interprets to update a record with the following fields:
//  * segment index (of the segment to write the symbol addresses to, typically
//    the __DATA_CONST segment which contains the GOT)
//  * offset within the segment, indicating the next location to write a binding
//  * symbol type
//  * symbol library ordinal (the index of its library's LC_LOAD_DYLIB command)
//  * symbol name
//  * addend
// When dyld sees BIND_OPCODE_DO_BIND, it uses the current record state to bind
// a symbol in the GOT, and increments the segment offset to point to the next
// entry. It does *not* clear the record state after doing the bind, so
// subsequent opcodes only need to encode the differences between bindings.
void BindingSection::finalizeContents() {}

void BindingSection::writeTo(uint8_t *buf) const {}

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
WeakBindingSection::WeakBindingSection() {}

void WeakBindingSection::finalizeContents() {}

void WeakBindingSection::writeTo(uint8_t *buf) const {}

StubsSection::StubsSection() {}

uint64_t StubsSection::getSize() const {}

void StubsSection::writeTo(uint8_t *buf) const {}

void StubsSection::finalize() {}

// Creates the lazy/non-lazy binding entries a stub needs. Stub body.
static void addBindingsForStub(Symbol *sym) {}

void StubsSection::addEntry(Symbol *sym) {}

StubHelperSection::StubHelperSection() {}

uint64_t StubHelperSection::getSize() const {}

bool StubHelperSection::isNeeded() const {}

void StubHelperSection::writeTo(uint8_t *buf) const {}

void StubHelperSection::setUp() {}

// Maps a method name to its (deduplicated) __objc_selrefs input section.
llvm::DenseMap<llvm::CachedHashStringRef, ConcatInputSection *>
    ObjCSelRefsHelper::methnameToSelref;
void ObjCSelRefsHelper::initialize() {}

void ObjCSelRefsHelper::cleanup() {}

ConcatInputSection *ObjCSelRefsHelper::makeSelRef(StringRef methname) {}

ConcatInputSection *ObjCSelRefsHelper::getSelRef(StringRef methname) {}

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
ObjCStubsSection::ObjCStubsSection() {}

bool ObjCStubsSection::isObjCStubSymbol(Symbol *sym) {}

StringRef ObjCStubsSection::getMethname(Symbol *sym) {}

void ObjCStubsSection::addEntry(Symbol *sym) {}

void ObjCStubsSection::setUp() {}

uint64_t ObjCStubsSection::getSize() const {}

void ObjCStubsSection::writeTo(uint8_t *buf) const {}

LazyPointerSection::LazyPointerSection() {}

uint64_t LazyPointerSection::getSize() const {}

bool LazyPointerSection::isNeeded() const {}

void LazyPointerSection::writeTo(uint8_t *buf) const {}

LazyBindingSection::LazyBindingSection() {}

void LazyBindingSection::finalizeContents() {}

void LazyBindingSection::writeTo(uint8_t *buf) const {}

void LazyBindingSection::addEntry(Symbol *sym) {}

// Unlike the non-lazy binding section, the bind opcodes in this section aren't
// interpreted all at once. Rather, dyld will start interpreting opcodes at a
// given offset, typically only binding a single symbol before it finds a
// BIND_OPCODE_DONE terminator. As such, unlike in the non-lazy-binding case,
// we cannot encode just the differences between symbols; we have to emit the
// complete bind information for each symbol.
uint32_t LazyBindingSection::encode(const Symbol &sym) {}

ExportSection::ExportSection() {}

void ExportSection::finalizeContents() {}

void ExportSection::writeTo(uint8_t *buf) const {}

DataInCodeSection::DataInCodeSection() {}

// Gathers data_in_code load-command entries from all input files. Stub body.
template <class LP>
static std::vector<MachO::data_in_code_entry> collectDataInCodeEntries() {}

void DataInCodeSection::finalizeContents() {}

void DataInCodeSection::writeTo(uint8_t *buf) const {}

FunctionStartsSection::FunctionStartsSection() {}

void FunctionStartsSection::finalizeContents() {}

void FunctionStartsSection::writeTo(uint8_t *buf) const {}

SymtabSection::SymtabSection(StringTableSection &stringTableSection)
    :{}

void SymtabSection::emitBeginSourceStab(StringRef sourceFile) {}

void SymtabSection::emitEndSourceStab() {}

void SymtabSection::emitObjectFileStab(ObjFile *file) {}

void SymtabSection::emitEndFunStab(Defined *defined) {}

void SymtabSection::emitStabs() {}

void SymtabSection::finalizeContents() {}

uint32_t SymtabSection::getNumSymbols() const {}

// This serves to hide (type-erase) the template parameter from SymtabSection.
template <class LP> class SymtabSectionImpl final : public SymtabSection {};

template <class LP> uint64_t SymtabSectionImpl<LP>::getRawSize() const {}

template <class LP> void SymtabSectionImpl<LP>::writeTo(uint8_t *buf) const {}

template <class LP>
SymtabSection *
macho::makeSymtabSection(StringTableSection &stringTableSection) {}

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
IndirectSymtabSection::IndirectSymtabSection() {}

uint32_t IndirectSymtabSection::getNumSymbols() const {}

bool IndirectSymtabSection::isNeeded() const {}

void IndirectSymtabSection::finalizeContents() {}

// Symtab index (or special INDIRECT_SYMBOL_* value) for `sym`. Stub body.
static uint32_t indirectValue(const Symbol *sym) {}

void IndirectSymtabSection::writeTo(uint8_t *buf) const {}

StringTableSection::StringTableSection() {}

// Appends `str` to the string table, returning its offset. Stub body.
uint32_t StringTableSection::addString(StringRef str) {}

void StringTableSection::writeTo(uint8_t *buf) const {}

// NOTE(review): both assertions had lost their conditions ("static_assert;"
// is ill-formed). Reconstructed as the 8-byte-alignment checks the code
// signature's blob headers require — TODO: confirm against upstream lld.
static_assert((CodeSignatureSection::blobHeadersSize % 8) == 0);
static_assert((CodeSignatureSection::fixedHeadersSize % 8) == 0);

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
CodeSignatureSection::CodeSignatureSection() {}

uint32_t CodeSignatureSection::getBlockCount() const {}

uint64_t CodeSignatureSection::getRawSize() const {}

// Hashes each page of the output and writes the digests. Stub body.
void CodeSignatureSection::writeHashes(uint8_t *buf) const {}

void CodeSignatureSection::writeTo(uint8_t *buf) const {}

CStringSection::CStringSection(const char *name) {}

void CStringSection::addInput(CStringInputSection *isec) {}

void CStringSection::writeTo(uint8_t *buf) const {}

void CStringSection::finalizeContents() {}

// Mergeable cstring literals are found under the __TEXT,__cstring section. In
// contrast to ELF, which puts strings that need different alignments into
// different sections, clang's Mach-O backend puts them all in one section.
// Strings that need to be aligned have the .p2align directive emitted before
// them, which simply translates into zero padding in the object file. In other
// words, we have to infer the desired alignment of these cstrings from their
// addresses.
//
// We differ slightly from ld64 in how we've chosen to align these cstrings.
// Both LLD and ld64 preserve the number of trailing zeros in each cstring's
// address in the input object files. When deduplicating identical cstrings,
// both linkers pick the cstring whose address has more trailing zeros, and
// preserve the alignment of that address in the final binary. However, ld64
// goes a step further and also preserves the offset of the cstring from the
// last section-aligned address.  I.e. if a cstring is at offset 18 in the
// input, with a section alignment of 16, then both LLD and ld64 will ensure the
// final address is 2-byte aligned (since 18 == 16 + 2). But ld64 will also
// ensure that the final address is of the form 16 * k + 2 for some k.
//
// Note that ld64's heuristic means that a dedup'ed cstring's final address is
// dependent on the order of the input object files. E.g. if in addition to the
// cstring at offset 18 above, we have a duplicate one in another file with a
// `.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
// the cstring from the object file earlier on the command line (since both have
// the same number of trailing zeros in their address). So the final cstring may
// either be at some address `16 * k + 2` or at some address `2 * k`.
//
// I've opted not to follow this behavior primarily for implementation
// simplicity, and secondarily to save a few more bytes. It's not clear to me
// that preserving the section alignment + offset is ever necessary, and there
// are many cases that are clearly redundant. In particular, if an x86_64 object
// file contains some strings that are accessed via SIMD instructions, then the
// .cstring section in the object file will be 16-byte-aligned (since SIMD
// requires its operand addresses to be 16-byte aligned). However, there will
// typically also be other cstrings in the same file that aren't used via SIMD
// and don't need this alignment. They will be emitted at some arbitrary address
// `A`, but ld64 will treat them as being 16-byte aligned with an offset of `16
// % A`.
// Deduplicates cstrings and assigns final offsets per the alignment policy
// described in the comment above. Stub body — implementation elided.
void DeduplicatedCStringSection::finalizeContents() {}

void DeduplicatedCStringSection::writeTo(uint8_t *buf) const {}

// Looks up the final offset assigned to `str` by finalizeContents.
// NOTE(review): stub body; presumably asserts if the string is unknown —
// confirm against upstream.
DeduplicatedCStringSection::StringOffset
DeduplicatedCStringSection::getStringOffset(StringRef str) const {}

// This section is actually emitted as __TEXT,__const by ld64, but clang may
// emit input sections of that name, and LLD doesn't currently support mixing
// synthetic and concat-type OutputSections. To work around this, I've given
// our merged-literals section a different name.
// NOTE(review): the "…() :{}" constructor was ill-formed (empty
// mem-initializer-list); ":" removed so the file parses. TODO: restore the
// elided member initialization.
WordLiteralSection::WordLiteralSection() {}

void WordLiteralSection::addInput(WordLiteralInputSection *isec) {}

void WordLiteralSection::finalizeContents() {}

void WordLiteralSection::writeTo(uint8_t *buf) const {}

// NOTE(review): "…() :{}" constructors were ill-formed (empty
// mem-initializer-list); the ":" has been removed so the file parses.
// TODO: restore the elided member initialization.
ObjCImageInfoSection::ObjCImageInfoSection() {}

ObjCImageInfoSection::ImageInfo
ObjCImageInfoSection::parseImageInfo(const InputFile *file) {}

// Renders a packed Swift version byte as a human-readable string. Stub body.
static std::string swiftVersionString(uint8_t version) {}

// Validate each object file's __objc_imageinfo and use them to generate the
// image info for the output binary. Only two pieces of info are relevant:
// 1. The Swift version (should be identical across inputs)
// 2. `bool hasCategoryClassProperties` (true only if true for all inputs)
void ObjCImageInfoSection::finalizeContents() {}

void ObjCImageInfoSection::writeTo(uint8_t *buf) const {}

InitOffsetsSection::InitOffsetsSection() {}

uint64_t InitOffsetsSection::getSize() const {}

void InitOffsetsSection::writeTo(uint8_t *buf) const {}

// The inputs are __mod_init_func sections, which contain pointers to
// initializer functions, therefore all relocations should be of the UNSIGNED
// type. InitOffsetsSection stores offsets, so if the initializer's address is
// not known at link time, stub-indirection has to be used.
void InitOffsetsSection::setUp() {}

// NOTE(review): the "…() :{}" constructor was ill-formed (empty
// mem-initializer-list); ":" removed so the file parses. TODO: restore the
// elided member initialization.
ObjCMethListSection::ObjCMethListSection() {}

// Go through all input method lists and ensure that we have selrefs for all
// their method names. The selrefs will be needed later by ::writeTo. We need to
// create them early on here to ensure they are processed correctly by the lld
// pipeline.
void ObjCMethListSection::setUp() {}

// Calculate section size and final offsets for where InputSection's need to be
// written.
void ObjCMethListSection::finalize() {}

void ObjCMethListSection::writeTo(uint8_t *bufStart) const {}

// Check if an InputSection is a method list. To do this we scan the
// InputSection for any symbols who's names match the patterns we expect clang
// to generate for method lists.
bool ObjCMethListSection::isMethodList(const ConcatInputSection *isec) {}

// Encode a single relative offset value. The input is the data/symbol at
// (&isec->data[inSecOff]). The output is written to (&buf[outSecOff]).
// 'createSelRef' indicates that we should not directly use the specified
// symbol, but instead get the selRef for the symbol and use that instead.
void ObjCMethListSection::writeRelativeOffsetForIsec(
    const ConcatInputSection *isec, uint8_t *buf, uint32_t &inSecOff,
    uint32_t &outSecOff, bool useSelRef) const {}

// Write a relative method list to buf, return the size of the written
// information
uint32_t
ObjCMethListSection::writeRelativeMethodList(const ConcatInputSection *isec,
                                             uint8_t *buf) const {}

// Given the size of an ObjC method list InputSection, return the size of the
// method list when encoded in relative offsets format. We can do this without
// decoding the actual data, as it can be directly inferred from the size of the
// isec.
uint32_t ObjCMethListSection::computeRelativeMethodListSize(
    uint32_t absoluteMethodListSize) const {}

// Read a method list header from buf
void ObjCMethListSection::readMethodListHeader(const uint8_t *buf,
                                               uint32_t &structSizeAndFlags,
                                               uint32_t &structCount) const {}

// Write a method list header to buf
void ObjCMethListSection::writeMethodListHeader(uint8_t *buf,
                                                uint32_t structSizeAndFlags,
                                                uint32_t structCount) const {}

// Creates linker-synthesized symbols (e.g. __mh_execute_header). Stub body.
void macho::createSyntheticSymbols() {}

// NOTE(review): the "…() :{}" constructor was ill-formed (empty
// mem-initializer-list); ":" removed so the file parses. TODO: restore the
// elided member initialization.
ChainedFixupsSection::ChainedFixupsSection() {}

bool ChainedFixupsSection::isNeeded() const {}

void ChainedFixupsSection::addBinding(const Symbol *sym,
                                      const InputSection *isec, uint64_t offset,
                                      int64_t addend) {}

// Returns the (import ordinal, inline addend bits) pair for a binding.
// Stub body.
std::pair<uint32_t, uint8_t>
ChainedFixupsSection::getBinding(const Symbol *sym, int64_t addend) const {}

// Serializes one dyld_chained_import* record; returns bytes written.
// Stub body.
static size_t writeImport(uint8_t *buf, int format, int16_t libOrdinal,
                          bool weakRef, uint32_t nameOffset, int64_t addend) {}

size_t ChainedFixupsSection::SegmentInfo::getSize() const {}

size_t ChainedFixupsSection::SegmentInfo::writeTo(uint8_t *buf) const {}

static size_t importEntrySize(int format) {}

// This is step 3 of the algorithm described in the class comment of
// ChainedFixupsSection.
//
// LC_DYLD_CHAINED_FIXUPS data consists of (in this order):
// * A dyld_chained_fixups_header
// * A dyld_chained_starts_in_image
// * One dyld_chained_starts_in_segment per segment
// * List of all imports (dyld_chained_import, dyld_chained_import_addend, or
//   dyld_chained_import_addend64)
// * Names of imported symbols
void ChainedFixupsSection::writeTo(uint8_t *buf) const {}

// This is step 2 of the algorithm described in the class comment of
// ChainedFixupsSection.
void ChainedFixupsSection::finalizeContents() {}

// Explicit instantiations for the two pointer sizes lld supports.
template SymtabSection *macho::makeSymtabSection<LP64>(StringTableSection &);
template SymtabSection *macho::makeSymtabSection<ILP32>(StringTableSection &);