//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass tries to combine multiple image_load intrinsics with dim=2dmsaa // or dim=2darraymsaa into a single image_msaa_load intrinsic if: // // - they refer to the same vaddr except for sample_id, // - they use a constant sample_id and they fall into the same group, // - they have the same dmask and the number of intrinsics and the number of // vaddr/vdata dword transfers is reduced by the combine. // // Examples for the tradeoff (all are assuming 2DMsaa for vaddr): // // +----------+-----+-----+-------+---------+------------+---------+----------+ // | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? | // | (dmask) | | | | vdata | | vdata | | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // // Some cases are of questionable benefit, like the one marked with "yes?" // above: fewer intrinsics and fewer vaddr and fewer total transfers between SP // and TX, but higher vdata. 
// We start by erring on the side of converting these
// to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" usingnamespacellvm; #define DEBUG_TYPE … namespace { class AMDGPUImageIntrinsicOptimizer : public FunctionPass { … }; // End of class AMDGPUImageIntrinsicOptimizer } // End anonymous namespace INITIALIZE_PASS(…) char AMDGPUImageIntrinsicOptimizer::ID = …; void addInstToMergeableList( IntrinsicInst *II, SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) { … } // Collect list of all instructions we know how to merge in a subset of the // block. It returns an iterator to the instruction after the last one analyzed. BasicBlock::iterator collectMergeableInsts( BasicBlock::iterator I, BasicBlock::iterator E, SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) { … } bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { … } static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { … } bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) { … } FunctionPass * llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) { … } PreservedAnalyses AMDGPUImageIntrinsicOptimizerPass::run(Function &F, FunctionAnalysisManager &AM) { … }