//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into a more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with a vector element operand due to
//    their inefficiency on some targets.
//
//    For example:
//      fmla v0.4s, v1.4s, v2.s[1]
//
//    is rewritten into:
//      dup  v3.4s, v2.s[1]
//      fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
//    inefficiency on some targets.
//
//    For example:
//      st2 {v0.4s, v1.4s}, addr
//
//    is rewritten into:
//      zip1 v2.4s, v0.4s, v1.4s
//      zip2 v3.4s, v0.4s, v1.4s
//      stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE …

STATISTIC(NumModifiedInstr, "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME …

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  …
};

char AArch64SIMDInstrOpt::ID = …;

} // end anonymous namespace

INITIALIZE_PASS(…)

/// Based only on the latency of instructions, determine if it is cost
/// efficient to replace the instruction InstDesc by the instructions stored
/// in the array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  …
}

/// Determine if we need to exit this pass early for a given kind of
/// instruction replacement. This makes sure that no compile time is spent
/// in this pass for targets with no need for any of these optimizations
/// beyond performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  …
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  …
}
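// A minimal sketch of the reuse check that reuseDUP performs, since its body
// is elided here. The free-standing helper name findExistingDup is
// hypothetical; the real pass does this work inside reuseDUP itself. The
// sketch assumes virtual (SSA) registers, where SrcReg has a single
// definition, so an earlier identical DUP still holds the wanted value.
static bool findExistingDup(MachineInstr &MI, unsigned DupOpcode,
                            unsigned SrcReg, unsigned LaneNumber,
                            unsigned *DestReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  // Scan the instructions preceding MI in the same basic block for a DUP
  // that splats the same lane of the same source register.
  for (MachineBasicBlock::iterator I = MBB.begin(), E = MI.getIterator();
       I != E; ++I) {
    if (I->getOpcode() == DupOpcode &&
        I->getOperand(1).getReg() == SrcReg &&
        I->getOperand(2).getImm() == (int64_t)LaneNumber) {
      // Reuse the destination of the earlier DUP instead of creating a new
      // register.
      *DestReg = I->getOperand(0).getReg();
      return true;
    }
  }
  return false;
}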
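// Likewise, a minimal sketch of the latency test behind shouldReplaceInst
// above. The helper name sumReplacementLatency is hypothetical and only the
// public TargetSchedModel API is used; the <map>/<unordered_map> includes
// suggest the real pass also caches such decisions to keep compile time low.
static unsigned
sumReplacementLatency(const TargetSchedModel &SchedModel,
                      SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // The replacement sequence forms a dependency chain (e.g. the FMLA reads
  // the DUP result), so its cost is the sum of the individual latencies.
  unsigned ReplLatency = 0;
  for (const MCInstrDesc *Desc : InstDescRepl)
    ReplLatency += SchedModel.computeInstrLatency(Desc->getOpcode());
  return ReplLatency;
}
// The rewrite pays off only when the whole replacement chain is faster than
// the single original instruction:
//   sumReplacementLatency(SchedModel, InstDescRepl) <
//       SchedModel.computeInstrLatency(InstDesc->getOpcode())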
/// Certain SIMD instructions with a vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded.
///
/// For example:
///   fmla v0.4s, v1.4s, v2.s[1]
///
/// is rewritten into:
///   dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///   fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  …
}

/// Load/store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical loads/stores.
///
/// For example:
///   st2 {v0.4s, v1.4s}, addr
///
/// is rewritten into:
///   zip1 v2.4s, v0.4s, v1.4s
///   zip2 v3.4s, v0.4s, v1.4s
///   stp  q2, q3, addr
///
/// For example:
///   st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// is rewritten into:
///   zip1 v4.4s, v0.4s, v2.4s
///   zip2 v5.4s, v0.4s, v2.4s
///   zip1 v6.4s, v1.4s, v3.4s
///   zip2 v7.4s, v1.4s, v3.4s
///   zip1 v8.4s, v4.4s, v6.4s
///   zip2 v9.4s, v4.4s, v6.4s
///   zip1 v10.4s, v5.4s, v7.4s
///   zip2 v11.4s, v5.4s, v7.4s
///   stp  q8, q9, addr
///   stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  …
}

/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
                                            unsigned *StReg,
                                            unsigned *StRegKill,
                                            unsigned NumArg) const {
  …
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  …
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  …
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  …
}
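// A minimal sketch of the rewrite described at optimizeVectElement above,
// emitting the DUP/FMLA pair for the running example
//   fmla v0.4s, v1.4s, v2.s[1]
// The helper name emitDupFmlaSketch is hypothetical, and the opcodes and
// operand indices assume the 4x32-bit floating-point by-element form with
// operand order (dst, accumulator, Rn, Rm, lane); the real pass selects
// opcodes for all the hardcoded FMLA/FMLS/FMUL/FMULX variants.
static void emitDupFmlaSketch(MachineInstr &MI, const TargetInstrInfo *TII,
                              MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // dup v3.4s, v2.s[1]: splat the used lane into a fresh 128-bit register.
  // (Skipped when reuseDUP finds an equivalent DUP already in the block.)
  Register DupDest = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
  BuildMI(MBB, MI, DL, TII->get(AArch64::DUPv4i32lane), DupDest)
      .add(MI.getOperand(3))  // Vector holding the lane (v2).
      .add(MI.getOperand(4)); // Lane number ([1]).

  // fmla v0.4s, v1.4s, v3.4s: same destination, accumulator, and first
  // source; the by-element operand is replaced by the splatted vector.
  BuildMI(MBB, MI, DL, TII->get(AArch64::FMLAv4f32),
          MI.getOperand(0).getReg())
      .add(MI.getOperand(1))  // Accumulator input (v0).
      .add(MI.getOperand(2))  // First vector source (v1).
      .addReg(DupDest);       // Splatted second source (v3).

  MI.eraseFromParent();
}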
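// In the same spirit, a sketch of the ST2 half of optimizeLdStInterleave
// above. The helper name emitSt2AsZipSketch is hypothetical; SrcReg0 and
// SrcReg1 stand for the two source registers that processSeqRegInst extracts
// from the REG_SEQUENCE feeding the st2, and the non-writeback st2 form
// (register list, then base address) is assumed.
static void emitSt2AsZipSketch(MachineInstr &MI, const TargetInstrInfo *TII,
                               MachineRegisterInfo &MRI, unsigned SrcReg0,
                               unsigned SrcReg1) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register ZipDest1 = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
  Register ZipDest2 = MRI.createVirtualRegister(&AArch64::FPR128RegClass);

  // zip1/zip2 interleave the low and high halves of the two source vectors,
  // producing in registers the same lane order that st2 produces in memory.
  BuildMI(MBB, MI, DL, TII->get(AArch64::ZIP1v4i32), ZipDest1)
      .addReg(SrcReg0)
      .addReg(SrcReg1);
  BuildMI(MBB, MI, DL, TII->get(AArch64::ZIP2v4i32), ZipDest2)
      .addReg(SrcReg0)
      .addReg(SrcReg1);

  // stp q2, q3, addr: a single paired store replaces the interleaved store.
  BuildMI(MBB, MI, DL, TII->get(AArch64::STPQi))
      .addReg(ZipDest1)
      .addReg(ZipDest2)
      .add(MI.getOperand(1)) // Base address of the original st2.
      .addImm(0);            // Offset, in 16-byte units.

  MI.eraseFromParent();
}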