// llvm/lld/MachO/SyntheticSections.h

//===- SyntheticSections.h -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLD_MACHO_SYNTHETIC_SECTIONS_H
#define LLD_MACHO_SYNTHETIC_SECTIONS_H

#include "Config.h"
#include "ExportTrie.h"
#include "InputSection.h"
#include "OutputSection.h"
#include "OutputSegment.h"
#include "Target.h"
#include "Writer.h"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"

#include <unordered_map>

namespace llvm {
// Forward declaration, so that this header can name llvm::DWARFUnit without
// including the header that defines it.
class DWARFUnit;
} // namespace llvm

namespace lld::macho {

// Forward declarations of lld::macho types whose full definitions are not part
// of this header. A forward declaration is sufficient for code that only uses
// pointers or references to these types.
class Defined;
class DylibSymbol;
class LoadCommand;
class ObjFile;
class UnwindInfoSection;

// Common base class of the output sections declared in this header: each of
// them derives from it, either directly or through an intermediate base
// (LinkEditSection, NonLazyPointerSectionBase, CStringSection). It is itself
// an OutputSection.
// NOTE(review): the name suggests these are sections whose contents the linker
// generates itself rather than copying from input files -- confirm.
class SyntheticSection : public OutputSection {};

// All sections in __LINKEDIT should inherit from this. In this header that
// covers the rebase and bind opcode sections, the export trie, data-in-code,
// function starts, the string and symbol tables, the indirect symbol table,
// the code signature, and the chained fixups section.
class LinkEditSection : public SyntheticSection {};

// The header of the Mach-O file, which must have a file offset of zero.
// It derives directly from SyntheticSection, not from LinkEditSection.
class MachHeaderSection final : public SyntheticSection {};

// A hidden section that exists solely to create the __PAGEZERO segment, which
// is used to catch null pointer dereferences.
class PageZeroSection final : public SyntheticSection {};

// This is the base class for the GOT and TLV pointer sections (GotSection and
// TlvPointerSection), which are nearly functionally identical -- both are
// populated by dyld with addresses of non-lazily-loaded dylib symbols. The
// main difference is that TlvPointerSection stores references to thread-local
// variables.
class NonLazyPointerSectionBase : public SyntheticSection {};

// The GOT: pointers that dyld populates with the addresses of
// non-lazily-loaded dylib symbols (see NonLazyPointerSectionBase). When
// chained fixups are used, the stubs in StubsSection also jump through
// addresses stored here.
class GotSection final : public NonLazyPointerSectionBase {};

// Like GotSection, but its entries are references to thread-local variables
// (see NonLazyPointerSectionBase).
class TlvPointerSection final : public NonLazyPointerSectionBase {};

// Helper type declared ahead of the rebase and binding sections.
// NOTE(review): its members are not visible here; presumably it identifies a
// place in the output that needs a rebase or bind -- confirm against its users.
struct Location {};

// Stores rebase opcodes, which tell dyld where absolute addresses have been
// encoded in the binary. If the binary is not loaded at its preferred address,
// dyld has to rebase these addresses by adding an offset to them.
//
// These belong to the classic dyld opcode format; see ChainedFixupsSection for
// the chained fixups format that replaces it.
class RebaseSection final : public LinkEditSection {};

// NOTE(review): members are not visible here; presumably one recorded binding
// (where to bind, and with which addend) -- confirm against BindingSection.
struct BindingEntry {};

// NOTE(review): as written this is a bare identifier, not a declaration. It
// looks like a truncated type alias (presumably a map from a symbol to its
// list of BindingEntry values) -- restore it from the full header before
// relying on this name.
BindingsMap;

// Stores bind opcodes for telling dyld which symbols to load non-lazily.
// Lazy binds and weak binds have their own sections (LazyBindingSection and
// WeakBindingSection).
class BindingSection final : public LinkEditSection {};

// Stores bind opcodes for telling dyld which weak symbols need coalescing.
// There are two types of entries in this section:
//
//   1) Non-weak definitions: Each of these is a symbol definition that weak
//   symbols in other dylibs should coalesce to.
//
//   2) Weak bindings: These tell dyld that a given symbol reference should
//   coalesce to a non-weak definition if one is found. Note that unlike the
//   entries in the BindingSection, the bindings here only refer to these
//   symbols by name, and do not specify which dylib to load them from.
class WeakBindingSection final : public LinkEditSection {};

// The following sections implement lazy symbol binding -- very similar to the
// PLT mechanism in ELF.
//
// ELF's .plt section is broken up into two sections in Mach-O: StubsSection
// and StubHelperSection. Calls to functions in dylibs will end up calling into
// StubsSection, which contains indirect jumps to addresses stored in the
// LazyPointerSection (the counterpart to ELF's .plt.got).
//
// We will first describe how non-weak symbols are handled.
//
// At program start, the LazyPointerSection contains addresses that each point
// to one of the entry points in the middle of the StubHelperSection. The code
// in StubHelperSection will push an offset into the LazyBindingSection onto
// the stack. The push is followed by a jump to the beginning of the
// StubHelperSection (similar to PLT0), which then calls into dyld_stub_binder.
// dyld_stub_binder is a non-lazily-bound symbol, so this call looks it up in
// the GOT.
//
// The stub binder will look up the bind opcodes in the LazyBindingSection at
// the given offset. The bind opcodes will tell the binder to update the
// address in the LazyPointerSection to point to the symbol, so that subsequent
// calls don't have to redo the symbol resolution. The binder will then jump to
// the resolved symbol.
//
// With weak symbols, the situation is slightly different. Since there is no
// "weak lazy" lookup, function calls to weak symbols are always non-lazily
// bound. We emit both regular non-lazy bindings and weak bindings, so that
// the weak bindings may overwrite the non-lazy bindings if an appropriate
// symbol is found at runtime. However, the bound addresses will still be
// written (non-lazily) into the LazyPointerSection.
//
// Symbols are always bound eagerly when chained fixups are used. In that case,
// StubsSection contains indirect jumps to addresses stored in the GotSection.
// The GOT directly contains the fixup entries, which will be replaced by the
// address of the target symbols on load. LazyPointerSection and
// StubHelperSection are not used.

// Contains the stubs that calls to functions in dylibs end up calling into.
// Each stub is an indirect jump through an address stored in the
// LazyPointerSection, or in the GotSection when chained fixups are used (see
// the overview comment above).
class StubsSection final : public SyntheticSection {};

// Contains the helper code that the lazy pointers initially point into: each
// entry point pushes an offset into the LazyBindingSection and jumps to the
// beginning of this section, which then calls into dyld_stub_binder (see the
// overview comment above). Not used when chained fixups are enabled.
class StubHelperSection final : public SyntheticSection {};

// Not a section: this class has no base class.
// NOTE(review): members are not visible here; presumably it helps look up or
// create the __objc_selrefs entries that ObjCStubsSection loads selectors
// from -- confirm.
class ObjCSelRefsHelper {};

// Objective-C stubs are hoisted objc_msgSend calls, one per selector called in
// the program. Apple Clang produces undefined symbols for each stub, such as
// '_objc_msgSend$foo', which are then synthesized by the linker. Each stub
// loads its selector 'foo' from __objc_selrefs, passes it as the selector
// argument of the objc_msgSend call (the second argument, after the receiver),
// and then jumps to objc_msgSend. The actual stub contents are mirrored from
// ld64.
class ObjCStubsSection final : public SyntheticSection {};

// Holds the addresses that the stubs in StubsSection jump through (the
// counterpart to ELF's .plt.got). It is not used with chained fixups.
//
// Note that this section may also be targeted by non-lazy bindings. In
// particular, this happens when branch relocations target weak symbols.
class LazyPointerSection final : public SyntheticSection {};

// Stores the bind opcodes for lazily bound symbols. The stub helper pushes an
// offset into this section, and the stub binder uses the opcodes found at that
// offset to update the address in the LazyPointerSection.
class LazyBindingSection final : public LinkEditSection {};

// Stores a trie that describes the set of exported symbols.
// NOTE(review): presumably built with the helpers from ExportTrie.h, which this
// header includes -- confirm.
class ExportSection final : public LinkEditSection {};

// Stores 'data in code' entries that describe the locations of data regions
// inside code sections. llvm-objdump uses this to distinguish jump tables and
// stop them from being disassembled as instructions.
class DataInCodeSection final : public LinkEditSection {};

// Stores the addresses of functions as ULEB128-encoded deltas.
class FunctionStartsSection final : public LinkEditSection {};

// Stores the strings referenced by the symbol table. makeSymtabSection() takes
// a reference to this section.
class StringTableSection final : public LinkEditSection {};

// NOTE(review): members are not visible here; presumably one symbol recorded
// for emission into the SymtabSection -- confirm.
struct SymtabEntry {};

// NOTE(review): members are not visible here; presumably one STABS debugging
// entry emitted into the symbol table -- confirm.
struct StabsEntry {};

// Symbols of the same type must be laid out contiguously: we choose to emit
// all local symbols first, then external symbols, and finally undefined
// symbols. For each symbol type, the LC_DYSYMTAB load command will record the
// range (start index and total number) of those symbols in the symbol table.
//
// Unlike most sections in this header, this class is not final; the
// makeSymtabSection<LP>() template declared below returns a pointer to one.
class SymtabSection : public LinkEditSection {};

// Returns a pointer to a SymtabSection for the given string table.
// NOTE(review): LP is presumably the target's pointer-size/layout policy, used
// to select the concrete symbol table layout -- confirm. Ownership of the
// returned section is not visible from this declaration.
template <class LP> SymtabSection *makeSymtabSection(StringTableSection &);

// The indirect symbol table is a list of 32-bit integers that serve as indices
// into the actual symbol table (see SymtabSection). The indirect symbol table
// is a concatenation of several sub-arrays of indices, each sub-array
// belonging to a separate section. The starting offset of each sub-array is
// stored in the reserved1 header field of the respective section.
//
// These sub-arrays provide symbol information for sections that store
// contiguous sequences of symbol references. These references can be pointers
// (e.g. those in the GOT and TLVP sections) or assembly sequences (e.g.
// function stubs).
class IndirectSymtabSection final : public LinkEditSection {};

// The code signature comes at the very end of the linked output file.
// Like the other __LINKEDIT contents, it derives from LinkEditSection.
class CodeSignatureSection final : public LinkEditSection {};

// Base class of DeduplicatedCStringSection; not marked final.
// NOTE(review): members are not visible here; presumably this section holds
// C string literals -- confirm.
class CStringSection : public SyntheticSection {};

// A CStringSection variant.
// NOTE(review): going by the name, presumably it stores each distinct string
// only once -- confirm.
class DeduplicatedCStringSection final : public CStringSection {};

/*
 * This section contains deduplicated literal values. The 16-byte values are
 * laid out first, followed by the 8- and then the 4-byte ones, i.e. in order
 * of decreasing size.
 */
class WordLiteralSection final : public SyntheticSection {};

// NOTE(review): members are not visible here; going by the name, presumably
// this synthesizes the Objective-C image info section of the output -- confirm.
class ObjCImageInfoSection final : public SyntheticSection {};

// This section stores 32-bit __TEXT segment offsets of initializer functions.
//
// The compiler stores pointers to initializers in __mod_init_func. These need
// to be fixed up at load time, which takes time and dirties memory. By
// synthesizing InitOffsetsSection from them, this data can live in the
// read-only __TEXT segment instead. This section is used by default when
// chained fixups are enabled.
//
// There is no similar counterpart to __mod_term_func, as that section is
// deprecated, and static destructors are instead handled by registering them
// via __cxa_atexit from an autogenerated initializer function (see
// https://reviews.llvm.org/D121736).
class InitOffsetsSection final : public SyntheticSection {};

// Synthetic section for __objc_methlist, which contains relative method lists
// if the -objc_relative_method_lists option is enabled.
class ObjCMethListSection final : public SyntheticSection {};

// Chained fixups are a replacement for classic dyld opcodes. In this format,
// most of the metadata necessary for binding symbols and rebasing addresses is
// stored directly in the memory location that will have the fixup applied.
//
// The fixups form singly linked lists, each one covering a single page in
// memory. The __LINKEDIT,__chainfixups section stores the page offset of the
// first fixup of each page; the rest can be found by walking the chain using
// the offset that is embedded in each entry.
//
// This setup allows pages to be relocated lazily at page-in time and without
// being dirtied. The kernel can discard and load them again as needed. This
// technique, called page-in linking, was introduced in macOS 13.
//
// The benefits of this format are:
//  - smaller __LINKEDIT segment, as most of the fixup information is stored in
//    the data segment
//  - faster startup, since not all relocations need to be done upfront
//  - slightly lower memory usage, as fewer pages are dirtied
//
// Userspace x86_64 and arm64 binaries have two types of fixup entries:
//   - Rebase entries contain an absolute address, to which the object's load
//     address will be added to get the final value. This is used for loading
//     the address of a symbol defined in the same binary.
//   - Binding entries are mostly used for symbols imported from other dylibs,
//     but also for weakly bound and interposable symbols. They are looked up
//     by a (symbol name, library) pair stored in __chainfixups. This import
//     entry also encodes whether the import is weak (i.e. if the symbol is
//     missing, it should be set to null instead of producing a load error).
//     The fixup encodes an ordinal associated with the import, and an optional
//     addend.
//
// The entries are tightly packed 64-bit bitfields. One of the bits specifies
// which kind of fixup to interpret them as.
//
// LLD generates the fixup data in 5 stages:
//   1. While scanning relocations, we make a note of each location that needs
//      a fixup by calling addRebase() or addBinding(). During this, we assign
//      a unique ordinal for each (symbol name, library, addend) import tuple.
//   2. After addresses have been assigned to all sections, and thus the memory
//      layout of the linked image is final, finalizeContents() is called. Here,
//      the page offsets of the chain start entries are calculated.
//   3. ChainedFixupsSection::writeTo() writes the page start offsets and the
//      imports table to the output file.
//   4. Each section's fixup entries are encoded and written to disk in
//      ConcatInputSection::writeTo(), but without writing the offsets that form
//      the chain.
//   5. Finally, the fixups of each page (which might correspond to multiple
//      sections) are linked together in Writer::buildFixupChains().
class ChainedFixupsSection final : public LinkEditSection {};

// Writes a chained-fixup rebase entry for the address targetVA to buf.
// NOTE(review): presumably buf must have room for one 64-bit fixup entry (the
// ChainedFixupsSection comment describes entries as 64-bit bitfields), and the
// link to the next entry in the chain is filled in later -- confirm.
void writeChainedRebase(uint8_t *buf, uint64_t targetVA);
// Writes the chained-fixup entry for a reference to sym plus addend to buf.
// NOTE(review): presumably this chooses between a rebase and a binding entry
// depending on sym, and buf must have room for one 64-bit entry -- confirm.
void writeChainedFixup(uint8_t *buf, const Symbol *sym, int64_t addend);

// Type of the global `in` object declared below.
// NOTE(review): members are not visible here; presumably it holds pointers to
// the individual synthetic sections of the link -- confirm.
struct InStruct {};

// Global state; these are declarations only, so the objects themselves are
// defined in a source file rather than in this header.
// `in` is the InStruct instance; syntheticSections is a list of
// SyntheticSection pointers.
// NOTE(review): presumably syntheticSections contains every synthetic section
// created for the link -- confirm.
extern InStruct in;
extern std::vector<SyntheticSection *> syntheticSections;

// Takes no arguments and returns nothing.
// NOTE(review): going by the name, presumably this creates the symbols that
// the linker itself defines (as opposed to those read from input files) --
// confirm against the definition.
void createSyntheticSymbols();

} // namespace lld::macho

#endif