// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -canonicalize -buffer-loop-hoisting -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run the analysis with the fuzzer heuristic and different seeds; these runs only
// verify that the analysis succeeds regardless of the traversal order.
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
// TODO: Some test cases from this file should be moved to other dialects.
// CHECK-LABEL: func @fill_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
// CHECK-NO-LAYOUT-MAP-LABEL: func @fill_inplace(%{{.*}}: memref<?xf32>) {
func.func @fill_inplace(
%A : tensor<?xf32> {bufferization.writable = true})
-> tensor<?xf32>
{
// CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
%f0 = arith.constant 0.0 : f32
/// Inplaceable, no alloc
// CHECK-NOT: alloc
// CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[A]] : memref<?xf32, strided<[?], offset: ?>>)
%r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?xf32>
}
// -----
/// %A is marked bufferization.writable = false, so the fill must allocate a new buffer.
// CHECK-LABEL: func @not_inplace(
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32> {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?xf32>) -> memref<?xf32>
func.func @not_inplace(
%A : tensor<?xf32> {bufferization.writable = false})
-> tensor<?xf32>
{
// CHECK: %[[F0:.*]] = arith.constant 0.000000e+00 : f32
%f0 = arith.constant 0.0 : f32
// CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, strided<[?], offset: ?>>
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 64 : i64} : memref<?xf32>
// CHECK: linalg.fill ins(%[[F0]] : f32) outs(%[[ALLOC]] : memref<?xf32>)
%r = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
// CHECK-NOT: dealloc
// CHECK: return %[[ALLOC]] : memref<?xf32>
return %r: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, strided<[?, ?], offset: ?>>) {
// CHECK-NO-LAYOUT-MAP-LABEL: func @not_inplace(%{{.*}}: memref<?x?xf32>) {
func.func @not_inplace(
%A : tensor<?x?xf32> {bufferization.writable = true})
-> tensor<?x?xf32>
{
%f0 = arith.constant 0.0 : f32
/// Cross-op multiple uses of %A: the first op, which has interfering reads, must allocate.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[ALLOC]]
%f = linalg.fill ins(%f0 : f32) outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>
/// The second op has no interfering reads and can reuse the buffer of %A in place.
// CHECK-NOT: alloc
// CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
%r = linalg.matmul ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
                   outs(%A: tensor<?x?xf32>)
                   -> tensor<?x?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?x?xf32>
}
// -----
// CHECK-LABEL: func @not_inplace
func.func @not_inplace(
%A : tensor<?x?xf32> {bufferization.writable = true}) -> tensor<?x?xf32> {
/// Multiple uses of %A within a single op: must allocate.
// CHECK: alloc
%r = linalg.matmul ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
                   outs(%A: tensor<?x?xf32>)
                   -> tensor<?x?xf32>
// CHECK-NOT: dealloc
return %r: tensor<?x?xf32>
}
// -----
// CHECK-LABEL: func @vec_inplace
func.func @vec_inplace(
%A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>)
-> tensor<?xf32>
{
%c0 = arith.constant 0 : index
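/// %A is writable and has no other conflicting uses, so the transfer_write can
/// operate directly on %A's buffer.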
// CHECK-NOT: alloc
%r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @vec_not_inplace
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, strided<[?], offset: ?>>
func.func @vec_not_inplace(
%A : tensor<?xf32> {bufferization.writable = true}, %vec : vector<4xf32>)
-> (tensor<?xf32>, tensor<?xf32>)
{
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
/// Cross-op multiple uses of %A: the first vector.transfer_write, which has interfering reads, must allocate.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: memref.copy {{.*}}, %[[ALLOC]]
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
%r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
/// The second vector.transfer has no interfering reads and can reuse the buffer.
// CHECK-NOT: alloc
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
%r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
// CHECK: return
// CHECK-NOT: tensor
return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK: func @matmul(
// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32>
// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32>
// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref<128x192xf32>
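// The bufferization.buffer_layout attributes below request identity layout maps
// at the function boundary, hence the plain (identity-layout) memref types
// checked above.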
func.func @matmul(
%A: tensor<128x256xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
%B: tensor<256x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
%C: tensor<128x192xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true})
-> tensor<128x192xf32> {
%c0 = arith.constant 0 : index
%c256 = arith.constant 256 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c128 = arith.constant 128 : index
%c192 = arith.constant 192 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
// The allocation of the 8x16 tile is hoisted out of the loop nest by -buffer-loop-hoisting.
// CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>
// CHECK: scf.for %[[I:.*]] =
%0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
  %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
    tensor<128x256xf32> to tensor<8x256xf32>
  // CHECK: scf.for %[[J:.*]] =
  %2 = scf.for %arg5 = %c0 to %c192 step %c16 iter_args(%arg6 = %arg4) -> (tensor<128x192xf32>) {
    %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
      tensor<256x192xf32> to tensor<256x16xf32>
    // Extract from %C instead of %arg6 to create an artificial out-of-place
    // bufferization for this slice.
    %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
      tensor<128x192xf32> to tensor<8x16xf32>
    // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
    %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
    // CHECK: scf.for %[[K:.*]] =
    %6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) {
      %8 = tensor.extract_slice %1[0, %arg7] [8, 32] [1, 1] :
        tensor<8x256xf32> to tensor<8x32xf32>
      %9 = tensor.extract_slice %3[%arg7, 0] [32, 16] [1, 1] :
        tensor<256x16xf32> to tensor<32x16xf32>
      // Both linalg.matmul and the enclosing scf.for bufferize in place.
      // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
      %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                          outs(%arg8 : tensor<8x16xf32>)
        -> tensor<8x16xf32>
      scf.yield %10 : tensor<8x16xf32>
    }
    // The insert_slice bufferizes in place, but its source comes from the
    // out-of-place buffer above, so a copy of the small buffer into the
    // bigger buffer is required.
    // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
    // CHECK: memref.copy %[[ALLOC]], %[[T]]
    %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
      tensor<8x16xf32> into tensor<128x192xf32>
    scf.yield %7 : tensor<128x192xf32>
  }
  scf.yield %2 : tensor<128x192xf32>
}
return %0 : tensor<128x192xf32>
}
// -----
/// This test only checks that the produced IR is valid and has no dominance
/// errors in its def-use chains.
// CHECK-LABEL: func @dominance_violation_bug_1
func.func @dominance_violation_bug_1(
%A : tensor<?x?xf32> {bufferization.writable = false},
%idx : index)
-> tensor<?x?xf32>
{
%f0 = arith.constant 0.0 : f32
%sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
%FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32>
%rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
%rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
return %rA : tensor<?x?xf32>
}
// -----
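// The linalg.generic below reads %arg0 via tensor.extract inside its body.
// After bufferization this becomes a memref.load directly from the %arg0
// buffer (see the checks after the function).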
func.func @gather_like(
%arg0 : tensor<?x?xf32> {bufferization.writable = false},
%arg1 : tensor<?xi32> {bufferization.writable = false},
%arg2 : tensor<?x?xf32> {bufferization.writable = true})
-> tensor<?x?xf32>
{
%0 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0)>,
                     affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]}
    ins(%arg1 : tensor<?xi32>) outs(%arg2 : tensor<?x?xf32>) {
  ^bb0(%arg3: i32, %arg4 : f32):
    %iv1 = linalg.index 1 : index
    %1 = arith.index_cast %arg3: i32 to index
    %2 = tensor.extract %arg0[%1, %iv1] : tensor<?x?xf32>
    linalg.yield %2 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// CHECK-LABEL: func @gather_like(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32,
// CHECK-SAME: %[[ARG1:.+]]: memref<?xi32
// CHECK-SAME: %[[ARG2:.+]]: memref<?x?xf32
// CHECK-SAME: ) {
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG1]] :
// CHECK-SAME: outs(%[[ARG2]] :
// CHECK: %[[YIELD:.+]] = memref.load %[[ARG0]]
// CHECK: linalg.yield %[[YIELD]]
// -----
// CHECK-LABEL: func @linalg_op_bufferizes_inplace_with_input
// CHECK-SAME: %[[t1:.*]]: memref<?x?xf32, strided{{.*}}>, %[[t2:.*]]: memref<?xf32, strided{{.*}}>, %[[t3:.*]]: memref<?x?xf32, strided{{.*}}>
func.func @linalg_op_bufferizes_inplace_with_input(
%t1: tensor<?x?xf32> {bufferization.writable = true},
%t2: tensor<?xf32> {bufferization.writable = true},
%t3: tensor<?x?xf32> {bufferization.writable = true},
%s1: index, %s2: index, %cst: f32)
-> tensor<?x?xf32>
{
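// %t1 and %t2 are only read; %t3 is the only operand that is written, so the
// generic can write into %t3's buffer without any copies.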
// CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t3]] : {{.*}})
%r = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                     affine_map<(d0, d1) -> (d1)>,
                     affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]}
    ins(%t1, %t2 : tensor<?x?xf32>, tensor<?xf32>)
    outs(%t3 : tensor<?x?xf32>) {
  ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) :
    %add = arith.addf %arg0, %arg1 : f32
    linalg.yield %add : f32
} -> tensor<?x?xf32>
return %r : tensor<?x?xf32>
}
// -----
#accesses = [
affine_map<(i) -> (i)>
]
#trait = {
indexing_maps = #accesses,
iterator_types = ["parallel"]
}
// CHECK-LABEL: func @op_is_reading_but_following_ops_are_not
// CHECK-SAME: %[[t0:.*]]: memref<?xf32
func.func @op_is_reading_but_following_ops_are_not(
%t0 : tensor<?xf32> {bufferization.writable = false},
%cst : f32)
-> tensor<?xf32>
{
// Make sure that a copy is inserted here.
// CHECK: %[[ALLOC:.*]] = memref.alloc
// CHECK: memref.copy %[[t0]], %[[ALLOC]]
// CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref
%r0 = linalg.generic #trait outs(%t0 : tensor<?xf32>) {
  ^bb(%0: f32) :
    %a = arith.addf %cst, %0 : f32
    linalg.yield %a : f32
} -> (tensor<?xf32>)
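// The second generic does not read its output operand (it only yields %cst),
// so it reuses the same buffer in place; no second copy is needed.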
// CHECK: linalg.generic {{.*}} outs(%[[ALLOC]] : memref
%r1 = linalg.generic #trait outs(%r0 : tensor<?xf32>) {
  ^bb(%0: f32) :
    linalg.yield %cst : f32
} -> (tensor<?xf32>)
// CHECK: return %[[ALLOC]]
return %r1 : tensor<?xf32>
}
// -----
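// The next tests cover bufferization of the named structured ops linalg.map,
// linalg.reduce, linalg.transpose, and linalg.broadcast; they only check that
// the ins operands are taken directly from the corresponding input buffers.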
// CHECK-LABEL: func @map_binary
// CHECK-SAME: %[[LHS:[0-9a-zA-Z]*]]: memref<64xf32
// CHECK-SAME: %[[RHS:[0-9a-zA-Z]*]]: memref<64xf32
func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>,
%init: tensor<64xf32>) -> tensor<64xf32> {
// CHECK: linalg.map { arith.addf } ins(%[[LHS]], %[[RHS]] : memref<64xf32
%add = linalg.map
    ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>)
    outs(%init:tensor<64xf32>)
    (%lhs_elem: f32, %rhs_elem: f32) {
      %0 = arith.addf %lhs_elem, %rhs_elem: f32
      linalg.yield %0: f32
    }
func.return %add : tensor<64xf32>
}
// -----
// CHECK-LABEL: func @reduce
// CHECK-SAME: %[[INPUT:.*]]: memref<16x32x64xf32
func.func @reduce(%input: tensor<16x32x64xf32>,
%init: tensor<16x64xf32>) -> tensor<16x64xf32> {
// CHECK: linalg.reduce { arith.addf } ins(%[[INPUT]] : memref<16x32x64xf32
%reduce = linalg.reduce
    ins(%input:tensor<16x32x64xf32>)
    outs(%init:tensor<16x64xf32>)
    dimensions = [1]
    (%in: f32, %out: f32) {
      %0 = arith.addf %out, %in: f32
      linalg.yield %0: f32
    }
func.return %reduce : tensor<16x64xf32>
}
// -----
// CHECK-LABEL: func @transpose
// CHECK-SAME: %[[ARG0:.*]]: memref<16x32x64xf32
func.func @transpose(%input: tensor<16x32x64xf32>,
%init: tensor<32x64x16xf32>) -> tensor<32x64x16xf32> {
// CHECK: linalg.transpose ins(%[[ARG0]] : memref<16x32x64xf32
%transpose = linalg.transpose
    ins(%input:tensor<16x32x64xf32>)
    outs(%init:tensor<32x64x16xf32>)
    permutation = [1, 2, 0]
func.return %transpose : tensor<32x64x16xf32>
}
// -----
// CHECK-LABEL: func @broadcast
// CHECK-SAME: %[[ARG0:.*]]: memref<8x32xf32
func.func @broadcast(%input: tensor<8x32xf32>,
%init: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
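// Mirrors the checks in the sibling tests above: the broadcast reads directly
// from the input buffer.
// CHECK: linalg.broadcast ins(%[[ARG0]] : memref<8x32xf32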
%bcast = linalg.broadcast
    ins(%input:tensor<8x32xf32>)
    outs(%init:tensor<8x16x32xf32>)
    dimensions = [1]
func.return %bcast : tensor<8x16x32xf32>
}
// -----
//===----------------------------------------------------------------------===//
// AllocTensorOp elimination would produce SSA violations for the example below.
//===----------------------------------------------------------------------===//
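// %2 is defined outside the loop but is inserted into the iter_arg %arg4 inside
// of it; rewriting %2 as a slice of %arg4 would use a value that does not
// dominate it, hence the SSA violation mentioned above.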
func.func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x18x32xf32>)
-> tensor<?x1x6x8xf32> {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%0 = bufferization.alloc_tensor() : tensor<4x1x6x8xf32>
%1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
%2 = bufferization.alloc_tensor() : tensor<1x6x8xf32>
%3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
  %4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
  %5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
    tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
  scf.yield %5 : tensor<?x1x6x8xf32>
}
return %3 : tensor<?x1x6x8xf32>
}
// -----
// CHECK-LABEL: func @do_not_copy_alloc_tensors(
func.func @do_not_copy_alloc_tensors(%f1: f32, %f2: f32, %idx: index)
-> (tensor<5xf32>, tensor<5xf32>)
{
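// Both insertions target the same alloc_tensor. Because an alloc_tensor has
// undefined contents, the conflict is resolved by materializing two separate
// allocations instead of inserting a copy.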
// CHECK: memref.alloc
// CHECK: memref.alloc
// CHECK-NOT: copy
// CHECK: memref.store
// CHECK: memref.store
%0 = bufferization.alloc_tensor() : tensor<5xf32>
%1 = tensor.insert %f1 into %0[%idx] : tensor<5xf32>
%2 = tensor.insert %f2 into %0[%idx] : tensor<5xf32>
return %1, %2 : tensor<5xf32>, tensor<5xf32>
}