//===- SubgroupReduceLowering.cpp - subgroup_reduce lowering patterns -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements gradual lowering of `gpu.subgroup_reduce` ops.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

using namespace mlir;

namespace {

/// Example, assumes `maxShuffleBitwidth` equal to 32:
/// ```
/// %a = gpu.subgroup_reduce add %x : (vector<3xf16>) -> vector<3xf16>
/// ==>
/// %v0 = arith.constant dense<0.0> : vector<3xf16>
/// %e0 = vector.extract_strided_slice %x
///   {offsets = [0], sizes = [2], strides = [1]} : vector<3xf16> to vector<2xf16>
/// %r0 = gpu.subgroup_reduce add %e0 : (vector<2xf16>) -> vector<2xf16>
/// %v1 = vector.insert_strided_slice %r0, %v0
///   {offsets = [0], strides = [1]} : vector<2xf16> into vector<3xf16>
/// %e1 = vector.extract %x[2] : f16 from vector<3xf16>
/// %r1 = gpu.subgroup_reduce add %e1 : (f16) -> f16
/// %a = vector.insert %r1, %v1[2] : f16 into vector<3xf16>
/// ```
struct BreakDownSubgroupReduce final
    : OpRewritePattern<gpu::SubgroupReduceOp> { … };

/// Example:
/// ```
/// %a = gpu.subgroup_reduce add %x : (vector<1xf32>) -> vector<1xf32>
/// ==>
/// %e0 = vector.extract %x[0] : f32 from vector<1xf32>
/// %r0 = gpu.subgroup_reduce add %e0 : (f32) -> f32
/// %a = vector.broadcast %r0 : f32 to vector<1xf32>
/// ```
struct ScalarizeSingleElementReduce final
    : OpRewritePattern<gpu::SubgroupReduceOp> { … };

struct ClusterInfo { … };

static FailureOr<ClusterInfo>
getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { … }

/// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
/// and `unpackFn` to convert to the native shuffle type and to the reduction
/// type, respectively. For example, with `input` of type `f16`, `packFn` could
/// build ops to cast the value to `i32` to perform shuffles, while `unpackFn`
/// would cast it back to `f16` to perform the arithmetic reduction on. Assumes
/// that the subgroup is `subgroupSize` lanes wide and divides it into clusters
/// of `clusterSize` lanes starting at lane 0 with a stride of `clusterStride`
/// for lanes within a cluster, reducing all lanes in each cluster in parallel.
Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc,
                                     Value input, gpu::AllReduceOperation mode,
                                     const ClusterInfo &ci,
                                     function_ref<Value(Value)> packFn,
                                     function_ref<Value(Value)> unpackFn) { … }

/// Lowers scalar gpu subgroup reductions to a series of shuffles.
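/// A sketch of the expected expansion for a full-subgroup `add`, assuming a
/// 32-lane subgroup reduced as a single cluster of stride 1 and an identity
/// pack/unpack (`f32` already fits the native shuffle width); the offset and
/// width constants, and the shuffles for offsets 4, 8, and 16, are elided:
/// ```
/// %a = gpu.subgroup_reduce add %x : (f32) -> f32
/// ==>
/// %s0, %valid0 = gpu.shuffle xor %x, %c1, %c32 : f32
/// %r0 = arith.addf %x, %s0 : f32
/// %s1, %valid1 = gpu.shuffle xor %r0, %c2, %c32 : f32
/// %r1 = arith.addf %r0, %s1 : f32
/// ... (offsets keep doubling up to half the cluster size) ...
/// ```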
struct ScalarSubgroupReduceToShuffles final
    : OpRewritePattern<gpu::SubgroupReduceOp> { … };

/// Lowers vector gpu subgroup reductions to a series of shuffles.
struct VectorSubgroupReduceToShuffles final
    : OpRewritePattern<gpu::SubgroupReduceOp> { … };

} // namespace

void mlir::populateGpuBreakDownSubgroupReducePatterns(
    RewritePatternSet &patterns, unsigned maxShuffleBitwidth,
    PatternBenefit benefit) { … }

void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth, PatternBenefit benefit) { … }
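
// Usage sketch (illustrative only; the surrounding pass body and the concrete
// parameter values below are assumptions, not defined in this file): a
// lowering pass would typically register both sets of patterns and apply them
// greedily, e.g.:
//
//   RewritePatternSet patterns(ctx);
//   populateGpuBreakDownSubgroupReducePatterns(patterns,
//                                              /*maxShuffleBitwidth=*/32);
//   populateGpuLowerSubgroupReduceToShufflePatterns(
//       patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
//   if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
//     signalPassFailure();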