//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements utilities that allow one to create IR moving the data // across different levels of the GPU memory hierarchy. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h" #include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/Pass/Pass.h" usingnamespacemlir; usingnamespacemlir::gpu; /// Emits the (imperfect) loop nest performing the copy between "from" and "to" /// values using the bounds derived from the "from" value. Emits at least /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with /// single-iteration loops. Maps the innermost loops to thread dimensions, in /// reverse order to enable access coalescing in the innermost loop. static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) { … } /// Emits the loop nests performing the copy to the designated location in the /// beginning of the region, and from the designated location immediately before /// the terminator of the first block of the region. The region is expected to /// have one block. This boils down to the following structure /// /// ^bb(...): /// <loop-bound-computation> /// for %arg0 = ... to ... step ... { /// ... /// for %argN = <thread-id-x> to ... step <block-dim-x> { /// %0 = load %from[%arg0, ..., %argN] /// store %0, %to[%arg0, ..., %argN] /// } /// ... /// } /// gpu.barrier /// <... original body ...> /// gpu.barrier /// for %arg0 = ... to ... step ... { /// ... /// for %argN = <thread-id-x> to ... step <block-dim-x> { /// %1 = load %to[%arg0, ..., %argN] /// store %1, %from[%arg0, ..., %argN] /// } /// ... /// } /// /// Inserts the barriers unconditionally since different threads may be copying /// values and reading them. An analysis would be required to eliminate barriers /// in case where value is only used by the thread that copies it. Both copies /// are inserted unconditionally, an analysis would be required to only copy /// live-in and live-out values when necessary. This copies the entire memref /// pointed to by "from". In case a smaller block would be sufficient, the /// caller can create a subview of the memref and promote it instead. static void insertCopies(Region ®ion, Location loc, Value from, Value to) { … } /// Promotes a function argument to workgroup memory in the given function. The /// copies will be inserted in the beginning and in the end of the function. void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { … }