//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// Armv6 introduced instructions to perform 32-bit SIMD operations. The /// purpose of this pass is to do some IR pattern matching to create ACLE /// DSP intrinsics, which map onto these 32-bit SIMD operations. /// This pass runs only when unaligned accesses are supported/enabled. // //===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" usingnamespacellvm; usingnamespacePatternMatch; #define DEBUG_TYPE … STATISTIC(NumSMLAD , "Number of smlad instructions generated"); static cl::opt<bool> DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass")); static cl::opt<unsigned> NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), cl::desc("Limit the number of loads analysed")); namespace { struct MulCandidate; class Reduction; MulCandList; MemInstList; MulPairList; // 
'MulCandidate' holds the multiplication instructions that are candidates // for parallel execution. struct MulCandidate { … }; /// Represent a sequence of multiply-accumulate operations with the aim to /// perform the multiplications in parallel. class Reduction { … }; class WidenedLoad { … }; class ARMParallelDSP : public FunctionPass { … }; } bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem) { … } // MaxBitWidth: the maximum supported bitwidth of the elements in the DSP // instructions, which is set to 16. So here we should collect all i8 and i16 // narrow operations. // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template<unsigned MaxBitWidth> bool ARMParallelDSP::IsNarrowSequence(Value *V) { … } /// Iterate through the block and record base, offset pairs of loads which can /// be widened into a single load. bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { … } // Search recursively back through the operands to find a tree of values that // form a multiply-accumulate chain. The search records the Add and Mul // instructions that form the reduction and allows us to find a single value // to be used as the initial input to the accumulator. bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { … } // The pass needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: // // acc0 = ... 
// ld0 = load i16 // sext0 = sext i16 %ld0 to i32 // ld1 = load i16 // sext1 = sext i16 %ld1 to i32 // mul0 = mul %sext0, %sext1 // ld2 = load i16 // sext2 = sext i16 %ld2 to i32 // ld3 = load i16 // sext3 = sext i16 %ld3 to i32 // mul1 = mul i32 %sext2, %sext3 // add0 = add i32 %mul0, %acc0 // acc1 = add i32 %add0, %mul1 // // Which can be selected to: // // ldr r0 // ldr r1 // smlad r2, r0, r1, r2 // // If constants are used instead of loads, these will need to be hoisted // out and into a register. // // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // bool ARMParallelDSP::MatchSMLAD(Function &F) { … } bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { … } void ARMParallelDSP::InsertParallelMACs(Reduction &R) { … } LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy) { … } Pass *llvm::createARMParallelDSPPass() { … } char ARMParallelDSP::ID = …; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false)