//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "ParallelSnippetGenerator.h" #include "BenchmarkRunner.h" #include "MCInstrDescView.h" #include "Target.h" // FIXME: Load constants into registers (e.g. with fld1) to not break // instructions like x87. // Ideally we would like the only limitation on executing instructions to be the // availability of the CPU resources (e.g. execution ports) needed to execute // them, instead of the availability of their data dependencies. // To achieve that, one approach is to generate instructions that do not have // data dependencies between them. // // For some instructions, this is trivial: // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // For the above snippet, haswell just renames rax four times and executes the // four instructions two at a time on P23 and P0126. // // For some instructions, we just need to make sure that the source is // different from the destination. For example, IDIV8r reads from GPR and // writes to AX. We just need to ensure that the Var is assigned a // register which is different from AX: // idiv bx // idiv bx // idiv bx // idiv bx // The above snippet will be able to fully saturate the ports, while the same // with ax would issue one uop every `latency(IDIV8r)` cycles. // // Some instructions make this harder because they both read from and write to // the same register: // inc rax // inc rax // inc rax // inc rax // This has a data dependency from each instruction to the next, limiting the // number of instructions that can be issued in parallel. 
// It turns out that this is not a big issue on recent Intel CPUs because they // have heuristics to balance port pressure. In the snippet above, subsequent // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs // might end up executing them all on P0 (just because they can), or try // avoiding P5 because it's usually under high pressure from vector // instructions. // This issue is even more important for high-latency instructions because // they increase the idle time of the CPU, e.g.: // imul rax, rbx // imul rax, rbx // imul rax, rbx // imul rax, rbx // // To avoid that, we do the renaming statically by generating as many // independent exclusive assignments as possible (until all possible registers // are exhausted) e.g.: // imul rax, rbx // imul rcx, rbx // imul rdx, rbx // imul r8, rbx // // Some instructions even make the above static renaming impossible because // they implicitly read from and write to the same operand, e.g. ADC16rr reads // and writes from EFLAGS. // In that case we just use a greedy register assignment and hope for the // best. 
namespace llvm { namespace exegesis { static bool hasVariablesWithTiedOperands(const Instruction &Instr) { … } ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; void ParallelSnippetGenerator::instantiateMemoryOperands( const unsigned ScratchSpacePointerInReg, std::vector<InstructionTemplate> &Instructions) const { … } enum class RegRandomizationStrategy : uint8_t { … }; } // namespace exegesis template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> { … }; namespace exegesis { const char *getDescription(RegRandomizationStrategy S) { … } static std::variant<std::nullopt_t, MCOperand, Register> generateSingleRegisterForInstrAvoidingDefUseOverlap( const LLVMState &State, const BitVector &ForbiddenRegisters, const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT, const Operand &Op, const ArrayRef<InstructionTemplate> Instructions, RegRandomizationStrategy S) { … } static std::optional<InstructionTemplate> generateSingleSnippetForInstrAvoidingDefUseOverlap( const LLVMState &State, const BitVector &ForbiddenRegisters, const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, BitVector &Uses, BitVector &Defs, InstructionTemplate IT, const ArrayRef<InstructionTemplate> Instructions, RegRandomizationStrategy S) { … } static std::vector<InstructionTemplate> generateSnippetForInstrAvoidingDefUseOverlap( const LLVMState &State, const InstructionTemplate &IT, RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) { … } Expected<std::vector<CodeTemplate>> ParallelSnippetGenerator::generateCodeTemplates( InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { … } constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; } // namespace exegesis } // namespace llvm