ParallelSnippetGenerator.cpp | Explore in Territory

//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limit the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g. :
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instruction even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

static bool hasVariablesWithTiedOperands(const Instruction &Instr) { … }

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const { … }

enum class RegRandomizationStrategy : uint8_t { … };

} // namespace exegesis

template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> { … };

namespace exegesis {

const char *getDescription(RegRandomizationStrategy S) { … }

static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) { … }

static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) { … }

static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) { … }

Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { … }

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm
llvm/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp