// llvm/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/fill-1d.mlir

// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -lower-vector-mask -one-shot-bufferize -buffer-deallocation-pipeline -test-lower-to-llvm | \
// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils | \
// RUN: FileCheck %s

// Test driver: allocates a 1-D f32 tensor holding 4 * vscale elements, seeds
// it with a placeholder value, overwrites it via linalg.fill, and prints each
// element followed by an end-of-output marker for FileCheck to match.
func.func @entry() {
  // Loop bounds / stride and the per-vscale element count.
  %lb = arith.constant 0 : index
  %stride = arith.constant 1 : index
  %lanes = arith.constant 4 : index

  // Scalars: the placeholder stored first and the value linalg.fill writes.
  %placeholder = arith.constant 123.0 : f32
  %fill_value = arith.constant 3.14 : f32

  // Runtime tensor length: 4 * vscale.
  %vs = vector.vscale
  %num_elems = arith.muli %lanes, %vs : index
  %uninit = bufferization.alloc_tensor(%num_elems) : tensor<?xf32>

  // Write the placeholder into every slot so the tensor has defined contents
  // that differ from the fill value before linalg.fill runs.
  %seeded = scf.for %idx = %lb to %num_elems step %stride
      iter_args(%acc = %uninit) -> tensor<?xf32> {
    %next = tensor.insert %placeholder into %acc[%idx] : tensor<?xf32>
    scf.yield %next : tensor<?xf32>
  }

  // Operation under test: every element is overwritten with the fill value.
  %filled = linalg.fill ins(%fill_value : f32)
      outs(%seeded : tensor<?xf32>) -> tensor<?xf32>

  // Every SVE vector holds at least 4 f32 elements, so at least four values
  // are printed below; implementations with wider vectors print more.
  // CHECK: 3.14
  // CHECK: 3.14
  // CHECK: 3.14
  // CHECK: 3.14
  scf.for %idx = %lb to %num_elems step %stride {
    %value = tensor.extract %filled[%idx] : tensor<?xf32>
    vector.print %value : f32
  }

  // Sentinel line marking the end of the expected output.
  // CHECK: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT"

  return
}

// Transform schedule: locates every linalg.fill in the payload and vectorizes
// it with a single scalable vector size of 4 (the inner brackets in
// `[[4]]` mark the dimension as scalable).
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(
      %root: !transform.any_op {transform.readonly}) {
    %fill_op = transform.structured.match ops{["linalg.fill"]} in %root
      : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %fill_op vector_sizes [[4]]
      : !transform.any_op
    transform.yield
  }
}