//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into a more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with a vector element operand due to
//    their inefficiency on some targets.
//
//    For example:
//      fmla v0.4s, v1.4s, v2.s[1]
//
//    is rewritten into:
//      dup  v3.4s, v2.s[1]
//      fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
//    inefficiency on some targets.
//
//    For example:
//      st2 {v0.4s, v1.4s}, addr
//
//    is rewritten into:
//      zip1 v2.4s, v0.4s, v1.4s
//      zip2 v3.4s, v0.4s, v1.4s
//      stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE …

STATISTIC(NumModifiedInstr, "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME …

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  …
};

char AArch64SIMDInstrOpt::ID = …;

} // end anonymous namespace

INITIALIZE_PASS(…)

/// Based only on the latency of instructions, determine if it is cost
/// efficient to replace the instruction InstDesc by the instructions stored
/// in the array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  …
}

/// Determine if we need to exit this pass early for a given kind of
/// instruction replacement. This makes sure that no compile time is spent
/// in this pass for targets with no need for any of these optimizations
/// beyond performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  …
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  …
}
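// A minimal sketch of the reuse check that reuseDUP performs, since its body
// is elided here. The free-standing helper name findExistingDup is
// hypothetical; the real pass does this work inside reuseDUP itself. The
// sketch assumes virtual (SSA) registers, where SrcReg has a single
// definition, so an earlier identical DUP still holds the wanted value.
static bool findExistingDup(MachineInstr &MI, unsigned DupOpcode,
                            unsigned SrcReg, unsigned LaneNumber,
                            unsigned *DestReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  // Scan the instructions preceding MI in the same basic block for a DUP
  // that splats the same lane of the same source register.
  for (MachineBasicBlock::iterator I = MBB.begin(), E = MI.getIterator();
       I != E; ++I) {
    if (I->getOpcode() == DupOpcode &&
        I->getOperand(1).getReg() == SrcReg &&
        I->getOperand(2).getImm() == (int64_t)LaneNumber) {
      // Reuse the destination of the earlier DUP instead of creating a new
      // register.
      *DestReg = I->getOperand(0).getReg();
      return true;
    }
  }
  return false;
}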
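// Likewise, a minimal sketch of the latency test behind shouldReplaceInst
// above. The helper name sumReplacementLatency is hypothetical and only the
// public TargetSchedModel API is used; the <map>/<unordered_map> includes
// suggest the real pass also caches such decisions to keep compile time low.
static unsigned
sumReplacementLatency(const TargetSchedModel &SchedModel,
                      SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // The replacement sequence forms a dependency chain (e.g. the FMLA reads
  // the DUP result), so its cost is the sum of the individual latencies.
  unsigned ReplLatency = 0;
  for (const MCInstrDesc *Desc : InstDescRepl)
    ReplLatency += SchedModel.computeInstrLatency(Desc->getOpcode());
  return ReplLatency;
}
// The rewrite pays off only when the whole replacement chain is faster than
// the single original instruction:
//   sumReplacementLatency(SchedModel, InstDescRepl) <
//       SchedModel.computeInstrLatency(InstDesc->getOpcode())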
/// Certain SIMD instructions with a vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded.
///
/// For example:
///   fmla v0.4s, v1.4s, v2.s[1]
///
/// is rewritten into:
///   dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///   fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  …
}

/// Load/store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical loads/stores.
///
/// For example:
///   st2 {v0.4s, v1.4s}, addr
///
/// is rewritten into:
///   zip1 v2.4s, v0.4s, v1.4s
///   zip2 v3.4s, v0.4s, v1.4s
///   stp  q2, q3, addr
///
/// For example:
///   st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// is rewritten into:
///   zip1 v4.4s, v0.4s, v2.4s
///   zip2 v5.4s, v0.4s, v2.4s
///   zip1 v6.4s, v1.4s, v3.4s
///   zip2 v7.4s, v1.4s, v3.4s
///   zip1 v8.4s, v4.4s, v6.4s
///   zip2 v9.4s, v4.4s, v6.4s
///   zip1 v10.4s, v5.4s, v7.4s
///   zip2 v11.4s, v5.4s, v7.4s
///   stp  q8, q9, addr
///   stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  …
}

/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
                                            unsigned *StReg,
                                            unsigned *StRegKill,
                                            unsigned NumArg) const {
  …
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  …
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  …
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  …
}
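// A minimal sketch of the rewrite described at optimizeVectElement above,
// emitting the DUP/FMLA pair for the running example
//   fmla v0.4s, v1.4s, v2.s[1]
// The helper name emitDupFmlaSketch is hypothetical, and the opcodes and
// operand indices assume the 4x32-bit floating-point by-element form with
// operand order (dst, accumulator, Rn, Rm, lane); the real pass selects
// opcodes for all the hardcoded FMLA/FMLS/FMUL/FMULX variants.
static void emitDupFmlaSketch(MachineInstr &MI, const TargetInstrInfo *TII,
                              MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // dup v3.4s, v2.s[1]: splat the used lane into a fresh 128-bit register.
  // (Skipped when reuseDUP finds an equivalent DUP already in the block.)
  Register DupDest = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
  BuildMI(MBB, MI, DL, TII->get(AArch64::DUPv4i32lane), DupDest)
      .add(MI.getOperand(3))  // Vector holding the lane (v2).
      .add(MI.getOperand(4)); // Lane number ([1]).

  // fmla v0.4s, v1.4s, v3.4s: same destination, accumulator, and first
  // source; the by-element operand is replaced by the splatted vector.
  BuildMI(MBB, MI, DL, TII->get(AArch64::FMLAv4f32),
          MI.getOperand(0).getReg())
      .add(MI.getOperand(1))  // Accumulator input (v0).
      .add(MI.getOperand(2))  // First vector source (v1).
      .addReg(DupDest);       // Splatted second source (v3).

  MI.eraseFromParent();
}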
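// In the same spirit, a sketch of the ST2 half of optimizeLdStInterleave
// above. The helper name emitSt2AsZipSketch is hypothetical; SrcReg0 and
// SrcReg1 stand for the two source registers that processSeqRegInst extracts
// from the REG_SEQUENCE feeding the st2, and the non-writeback st2 form
// (register list, then base address) is assumed.
static void emitSt2AsZipSketch(MachineInstr &MI, const TargetInstrInfo *TII,
                               MachineRegisterInfo &MRI, unsigned SrcReg0,
                               unsigned SrcReg1) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register ZipDest1 = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
  Register ZipDest2 = MRI.createVirtualRegister(&AArch64::FPR128RegClass);

  // zip1/zip2 interleave the low and high halves of the two source vectors,
  // producing in registers the same lane order that st2 produces in memory.
  BuildMI(MBB, MI, DL, TII->get(AArch64::ZIP1v4i32), ZipDest1)
      .addReg(SrcReg0)
      .addReg(SrcReg1);
  BuildMI(MBB, MI, DL, TII->get(AArch64::ZIP2v4i32), ZipDest2)
      .addReg(SrcReg0)
      .addReg(SrcReg1);

  // stp q2, q3, addr: a single paired store replaces the interleaved store.
  BuildMI(MBB, MI, DL, TII->get(AArch64::STPQi))
      .addReg(ZipDest1)
      .addReg(ZipDest2)
      .add(MI.getOperand(1)) // Base address of the original st2.
      .addImm(0);            // Offset, in 16-byte units.

  MI.eraseFromParent();
}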