//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass tries to combine multiple image_load intrinsics with dim=2dmsaa // or dim=2darraymsaa into a single image_msaa_load intrinsic if: // // - they refer to the same vaddr except for sample_id, // - they use a constant sample_id and they fall into the same group, // - they have the same dmask and the number of intrinsics and the number of // vaddr/vdata dword transfers is reduced by the combine. // // Examples for the tradeoff (all are assuming 2DMsaa for vaddr): // // +----------+-----+-----+-------+---------+------------+---------+----------+ // | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? | // | (dmask) | | | | vdata | | vdata | | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no | // +----------+-----+-----+-------+---------+------------+---------+----------+ // | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes | // +----------+-----+-----+-------+---------+------------+---------+----------+ // // Some cases are of questionable benefit, like the one marked with "yes?" // above: fewer intrinsics and fewer vaddr and fewer total transfers between SP // and TX, but higher vdata. 
// We start by erring on the side of converting these
// to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" usingnamespacellvm; #define DEBUG_TYPE … namespace { class AMDGPUImageIntrinsicOptimizer : public FunctionPass { … }; // End of class AMDGPUImageIntrinsicOptimizer } // End anonymous namespace INITIALIZE_PASS(…) char AMDGPUImageIntrinsicOptimizer::ID = …; void addInstToMergeableList( IntrinsicInst *II, SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) { … } // Collect list of all instructions we know how to merge in a subset of the // block. It returns an iterator to the instruction after the last one analyzed. BasicBlock::iterator collectMergeableInsts( BasicBlock::iterator I, BasicBlock::iterator E, SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) { … } bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { … } static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { … } bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) { … } FunctionPass * llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) { … } PreservedAnalyses AMDGPUImageIntrinsicOptimizerPass::run(Function &F, FunctionAnalysisManager &AM) { … }