llvm/mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir

// RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=8" --cse --split-input-file %s | FileCheck %s
// RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=32" --cse --split-input-file %s | FileCheck %s --check-prefix=CHECK32

func.func @vector_load_i8(%arg1: index, %arg2: index) -> vector<4xi8> {
    %0 = memref.alloc() : memref<3x4xi8>
    %1 = vector.load %0[%arg1, %arg2] : memref<3x4xi8>, vector<4xi8>
    return %1 : vector<4xi8>
}
// Expect no conversions, i8 is supported.
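// With 32-bit containers (CHECK32), memref<3x4xi8> is flattened to memref<3xi32>
// (4 bytes per word); the load reads one i32 word at index s0 + s1 floordiv 4 and
// bitcasts it back to vector<4xi8>.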
//      CHECK: func @vector_load_i8(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
// CHECK-NEXT:   %[[ALLOC:.+]] = memref.alloc() : memref<3x4xi8>
// CHECK-NEXT:   [[L:%.+]] = vector.load %[[ALLOC]][%[[ARG0]], %[[ARG1]]] : memref<3x4xi8>, vector<4xi8>
// CHECK-NEXT:   return

//      CHECK32: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 4)>
//      CHECK32: func @vector_load_i8(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[VECLOAD:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi32>, vector<1xi32>
//      CHECK32:   %[[VEC_I8:.+]] = vector.bitcast %[[VECLOAD]] : vector<1xi32> to vector<4xi8>
//      CHECK32:   return %[[VEC_I8]]

// -----

func.func @vector_load_i4(%arg1: index, %arg2: index) -> vector<3x8xi4> {
    %0 = memref.alloc() : memref<3x8xi4>
    %cst = arith.constant dense<0> : vector<3x8xi4>
    %1 = vector.load %0[%arg1, %arg2] : memref<3x8xi4>, vector<8xi4>
    %2 = vector.insert %1, %cst [0] : vector<8xi4> into vector<3x8xi4>
    return %2 : vector<3x8xi4>
}
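// i4 is always emulated: two i4 elements fit in one i8 byte (eight in one i32 word).
// memref<3x8xi4> becomes memref<12xi8> with linearized byte index s0 * 4 + s1 floordiv 2,
// or memref<3xi32> with word index s0 + s1 floordiv 8, and the loaded container vector is
// bitcast back to vector<8xi4>.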
//  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
//      CHECK: func @vector_load_i4
// CHECK-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
//      CHECK:   %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK:   %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<12xi8>, vector<4xi8>
//      CHECK:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xi4>

//  CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
//      CHECK32: func @vector_load_i4
// CHECK32-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi32>, vector<1xi32>
//      CHECK32:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xi4>

// -----

func.func @vector_load_i4_dynamic(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> vector<8xi4> {
  %0 = memref.alloc(%arg0, %arg1) : memref<?x?xi4>
  %1 = vector.load %0[%arg2, %arg3] : memref<?x?xi4>, vector<8xi4>
  return %1 : vector<8xi4>
}
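// Dynamic shapes: both the flattened allocation size and the load index are computed
// with affine.apply over the dynamic dimensions, dividing the linearized i4 element
// count by 2 (i8 containers) or 8 (i32 containers).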
//  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) floordiv 2)>
//  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2] -> ((s2 + s0 * s1) floordiv 2)>
//      CHECK: func.func @vector_load_i4_dynamic(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: index
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: index
// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9_]+]]: index
//      CHECK:   %[[SIZE:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK:   %[[ALLOC:.+]] = memref.alloc(%[[SIZE]]) : memref<?xi8>
//      CHECK:   %[[INDEX:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG1]], %[[ARG3]]]
//      CHECK:   %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<?xi8>, vector<4xi8>
//      CHECK:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xi4>

//  CHECK32-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) floordiv 8)>
//  CHECK32-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2] -> ((s2 + s0 * s1) floordiv 8)>
//      CHECK32: func.func @vector_load_i4_dynamic(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: index
// CHECK32-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: index
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: index
// CHECK32-SAME:     %[[ARG3:[a-zA-Z0-9_]+]]: index
//      CHECK32:   %[[SIZE:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc(%[[SIZE]]) : memref<?xi32>
//      CHECK32:   %[[INDEX:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[ARG1]], %[[ARG3]]]
//      CHECK32:   %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<?xi32>, vector<1xi32>
//      CHECK32:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xi4>

// -----

func.func @vector_transfer_read_i4(%arg1: index, %arg2: index) -> vector<8xi4> {
    %c0 = arith.constant 0 : i4
    %0 = memref.alloc() : memref<3x8xi4>
    %1 = vector.transfer_read %0[%arg1, %arg2], %c0 {in_bounds = [true]} :
      memref<3x8xi4>, vector<8xi4>
    return %1 : vector<8xi4>
}
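// transfer_read is emulated like a load; in addition, the i4 padding value is widened
// with arith.extui to the container element type (i8 or i32) before the read.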
//  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
//      CHECK: func @vector_transfer_read_i4
// CHECK-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
//      CHECK:   %[[CONST:.+]] = arith.constant 0 : i4
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
//      CHECK:   %[[PAD:.+]] = arith.extui %[[CONST]] : i4 to i8
//      CHECK:   %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<12xi8>, vector<4xi8>
//      CHECK:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xi4>

//  CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
//      CHECK32: func @vector_transfer_read_i4
// CHECK32-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
//      CHECK32:   %[[CONST:.+]] = arith.constant 0 : i4
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[PAD:.+]] = arith.extui %[[CONST]] : i4 to i32
//      CHECK32:   %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<3xi32>, vector<1xi32>
//      CHECK32:   %[[VEC_I4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xi4>

// -----

func.func @vector_maskedload_i8(%arg1: index, %arg2: index, %arg3: index, %passthru: vector<4xi8>) -> vector<4xi8> {
    %0 = memref.alloc() : memref<3x4xi8>
    %mask = vector.create_mask %arg3 : vector<4xi1>
    %1 = vector.maskedload %0[%arg1, %arg2], %mask, %passthru :
      memref<3x4xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
    return %1 : vector<4xi8>
}
// Expect no conversions, i8 is supported.
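// With 32-bit containers (CHECK32), the 4-byte masked load becomes a 1-word masked load:
// the mask bound is rounded up with (s0 + 3) floordiv 4, the passthru is bitcast to
// vector<1xi32>, and a final arith.select under the original mask keeps the passthru
// value in lanes past the original mask bound.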
//      CHECK: func @vector_maskedload_i8(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<4xi8>)
// CHECK-NEXT:   %[[ALLOC:.+]] = memref.alloc() : memref<3x4xi8>
// CHECK-NEXT:   %[[MASK:.+]] = vector.create_mask %[[ARG2]] : vector<4xi1>
// CHECK-NEXT:   [[L:%.+]] = vector.maskedload %[[ALLOC]][%[[ARG0]], %[[ARG1]]], %[[MASK]], %[[ARG3]] :
// CHECK-SAME:     memref<3x4xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
// CHECK-NEXT:   return

//  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 4)>
//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
//      CHECK32: func @vector_maskedload_i8(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<4xi8>)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.create_mask %[[ARG2]] : vector<4xi1>
//      CHECK32:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[MASK_IDX:.+]] = affine.apply #[[MASK_IDX_MAP]]()[%[[ARG2]]]
//      CHECK32:   %[[NEW_MASK:.+]] = vector.create_mask %[[MASK_IDX]] : vector<1xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<4xi8> to vector<1xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<4xi8>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<4xi1>, vector<4xi8>
//      CHECK32:   return %[[SELECT]]

// -----

func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passthru: vector<8xi4>) -> vector<3x8xi4> {
    %0 = memref.alloc() : memref<3x8xi4>
    %cst = arith.constant dense<0> : vector<3x8xi4>
    %mask = vector.create_mask %arg3 : vector<8xi1>
    %1 = vector.maskedload %0[%arg1, %arg2], %mask, %passthru :
      memref<3x8xi4>, vector<8xi1>, vector<8xi4> into vector<8xi4>
    %2 = vector.insert %1, %cst [0] : vector<8xi4> into vector<3x8xi4>
    return %2 : vector<3x8xi4>
}
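// Same recipe for i4: the narrowed mask covers (s0 + 1) floordiv 2 bytes (or
// (s0 + 7) floordiv 8 words), and the original 8-lane i4 mask drives the final
// arith.select against the passthru.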
//  CHECK-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
//  CHECK-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 1) floordiv 2)>
//      CHECK: func @vector_maskedload_i4(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
//      CHECK:   %[[ORIG_MASK:.+]] = vector.create_mask %[[ARG2]] : vector<8xi1>
//      CHECK:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK:   %[[MASK_IDX:.+]] = affine.apply #[[MASK_IDX_MAP]]()[%[[ARG2]]]
//      CHECK:   %[[NEW_MASK:.+]] = vector.create_mask %[[MASK_IDX]] : vector<4xi1>
//      CHECK:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<8xi4> to vector<4xi8>
//      CHECK:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK-SAME:     memref<12xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
//      CHECK:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<4xi8> to vector<8xi4>
//      CHECK:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>

//  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
//      CHECK32: func @vector_maskedload_i4(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.create_mask %[[ARG2]] : vector<8xi1>
//      CHECK32:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[MASK_IDX:.+]] = affine.apply #[[MASK_IDX_MAP]]()[%[[ARG2]]]
//      CHECK32:   %[[NEW_MASK:.+]] = vector.create_mask %[[MASK_IDX]] : vector<1xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG3]] : vector<8xi4> to vector<1xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<8xi4>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>

// -----

func.func @vector_cst_maskedload_i8(%arg1: index, %arg2: index, %passthru: vector<4xi8>) -> vector<4xi8> {
    %0 = memref.alloc() : memref<3x4xi8>
    %mask = vector.constant_mask [2] : vector<4xi1>
    %1 = vector.maskedload %0[%arg1, %arg2], %mask, %passthru :
      memref<3x4xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
    return %1 : vector<4xi8>
}
// Expect no conversions, i8 is supported.
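// With 32-bit containers (CHECK32), constant_mask [2] over 4 i8 lanes rounds up to
// constant_mask [1] over the single i32 container lane, with the usual
// bitcast + arith.select cleanup on the loaded value.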
//      CHECK: func @vector_cst_maskedload_i8(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: vector<4xi8>)
// CHECK-NEXT:   %[[ALLOC:.+]] = memref.alloc() : memref<3x4xi8>
// CHECK-NEXT:   %[[MASK:.+]] = vector.constant_mask [2] : vector<4xi1>
// CHECK-NEXT:   [[L:%.+]] = vector.maskedload %[[ALLOC]][%[[ARG0]], %[[ARG1]]], %[[MASK]], %[[ARG2]] :
// CHECK-SAME:     memref<3x4xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
// CHECK-NEXT:   return

//  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 4)>
//      CHECK32: func @vector_cst_maskedload_i8(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: vector<4xi8>)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.constant_mask [2] : vector<4xi1>
//      CHECK32:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[NEW_MASK:.+]] = vector.constant_mask [1] : vector<1xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG2]] : vector<4xi8> to vector<1xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<4xi8>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG2]] : vector<4xi1>, vector<4xi8>
//      CHECK32:   return %[[SELECT]]

// -----

func.func @vector_cst_maskedload_i4(%arg1: index, %arg2: index, %passthru: vector<8xi4>) -> vector<3x8xi4> {
    %0 = memref.alloc() : memref<3x8xi4>
    %cst = arith.constant dense<0> : vector<3x8xi4>
    %mask = vector.constant_mask [4] : vector<8xi1>
    %1 = vector.maskedload %0[%arg1, %arg2], %mask, %passthru :
      memref<3x8xi4>, vector<8xi1>, vector<8xi4> into vector<8xi4>
    %2 = vector.insert %1, %cst [0] : vector<8xi4> into vector<3x8xi4>
    return %2 : vector<3x8xi4>
}
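// constant_mask [4] over 8 i4 lanes becomes constant_mask [2] over 4 i8 containers,
// or constant_mask [1] over 1 i32 container.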
//  CHECK-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
//      CHECK: func @vector_cst_maskedload_i4(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: vector<8xi4>)
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
//      CHECK:   %[[ORIG_MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
//      CHECK:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK:   %[[NEW_MASK:.+]] = vector.constant_mask [2] : vector<4xi1>
//      CHECK:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG2]] : vector<8xi4> to vector<4xi8>
//      CHECK:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK-SAME:     memref<12xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
//      CHECK:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<4xi8> to vector<8xi4>
//      CHECK:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG2]] : vector<8xi1>, vector<8xi4>

//  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
//      CHECK32: func @vector_cst_maskedload_i4(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: vector<8xi4>)
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
//      CHECK32:   %[[LD_IDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
//      CHECK32:   %[[NEW_MASK:.+]] = vector.constant_mask [1] : vector<1xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[ARG2]] : vector<8xi4> to vector<1xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LD_IDX]]], %[[NEW_MASK]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<3xi32>, vector<1xi1>, vector<1xi32> into vector<1xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<8xi4>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG2]] : vector<8xi1>, vector<8xi4>

// -----

func.func @vector_extract_maskedload_i4(%arg1: index) -> vector<8x8x16xi4> {
    %0 = memref.alloc() : memref<8x8x16xi4>
    %c0 = arith.constant 0 : index
    %c16 = arith.constant 16 : index
    %c8 = arith.constant 8 : index
    %cst_1 = arith.constant dense<0> : vector<8x8x16xi4>
    %cst_2 = arith.constant dense<0> : vector<16xi4>
    %27 = vector.create_mask %c8, %arg1, %c16 : vector<8x8x16xi1>
    %48 = vector.extract %27[0] : vector<8x16xi1> from vector<8x8x16xi1>
    %49 = vector.extract %48[0] : vector<16xi1> from vector<8x16xi1>
    %50 = vector.maskedload %0[%c0, %c0, %c0], %49, %cst_2 : memref<8x8x16xi4>, vector<16xi1>, vector<16xi4> into vector<16xi4>
    %63 = vector.insert %50, %cst_1 [0, 0] : vector<16xi4> into vector<8x8x16xi4>
    return %63 : vector<8x8x16xi4>
}
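// The multi-dimensional create_mask feeding the masked load (through a chain of
// vector.extract ops) is rewritten as well: its innermost dimension shrinks from
// 16 i4 lanes to 8 i8 (or 2 i32) container lanes, and the extract chain is replayed
// on the new mask.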
//      CHECK: func @vector_extract_maskedload_i4(
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<512xi8>
//      CHECK:   %[[PASSTHRU:.+]] = arith.constant dense<0> : vector<16xi4>
//      CHECK:   %[[ORIG_MASK:.+]] = vector.create_mask {{.*}} vector<8x8x16xi1>
//      CHECK:   %[[ORIG_EXT1:.+]] = vector.extract %[[ORIG_MASK]][0] : vector<8x16xi1>
//      CHECK:   %[[ORIG_EXT2:.+]] = vector.extract %[[ORIG_EXT1]][0] : vector<16xi1>
//      CHECK:   %[[NEW_MASK:.+]] = vector.create_mask {{.*}} vector<8x8x8xi1>
//      CHECK:   %[[NEW_EXT1:.+]] = vector.extract %[[NEW_MASK]][0] : vector<8x8xi1>
//      CHECK:   %[[NEW_EXT2:.+]] = vector.extract %[[NEW_EXT1]][0] : vector<8xi1>
//      CHECK:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<8xi8>
//      CHECK:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
// CHECK-SAME:     memref<512xi8>, vector<8xi1>, vector<8xi8> into vector<8xi8>
//      CHECK:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<8xi8> to vector<16xi4>
//      CHECK:   %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>

//      CHECK32: func @vector_extract_maskedload_i4(
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<128xi32>
//      CHECK32:   %[[PASSTHRU:.+]] = arith.constant dense<0> : vector<16xi4>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.create_mask {{.*}} vector<8x8x16xi1>
//      CHECK32:   %[[ORIG_EXT1:.+]] = vector.extract %[[ORIG_MASK]][0] : vector<8x16xi1>
//      CHECK32:   %[[ORIG_EXT2:.+]] = vector.extract %[[ORIG_EXT1]][0] : vector<16xi1>
//      CHECK32:   %[[NEW_MASK:.+]] = vector.create_mask {{.*}} vector<8x8x2xi1>
//      CHECK32:   %[[NEW_EXT1:.+]] = vector.extract %[[NEW_MASK]][0] : vector<8x2xi1>
//      CHECK32:   %[[NEW_EXT2:.+]] = vector.extract %[[NEW_EXT1]][0] : vector<2xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<2xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<128xi32>, vector<2xi1>, vector<2xi32> into vector<2xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<16xi4>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>

// -----

func.func @vector_extract_cst_maskedload_i4() -> vector<8x8x16xi4> {
    %0 = memref.alloc() : memref<8x8x16xi4>
    %c0 = arith.constant 0 : index
    %cst_1 = arith.constant dense<0> : vector<8x8x16xi4>
    %cst_2 = arith.constant dense<0> : vector<16xi4>
    %27 = vector.constant_mask [8, 4, 16] : vector<8x8x16xi1>
    %48 = vector.extract %27[0] : vector<8x16xi1> from vector<8x8x16xi1>
    %49 = vector.extract %48[0] : vector<16xi1> from vector<8x16xi1>
    %50 = vector.maskedload %0[%c0, %c0, %c0], %49, %cst_2 : memref<8x8x16xi4>, vector<16xi1>, vector<16xi4> into vector<16xi4>
    %63 = vector.insert %50, %cst_1 [0, 0] : vector<16xi4> into vector<8x8x16xi4>
    return %63 : vector<8x8x16xi4>
}
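// Same as above with a constant mask: the rewritten constant_mask has its innermost
// dimension rescaled to the container granularity (vector<8x8x8xi1> / vector<8x8x2xi1>).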
//      CHECK: func @vector_extract_cst_maskedload_i4(
//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<512xi8>
//      CHECK:   %[[PASSTHRU:.+]] = arith.constant dense<0> : vector<16xi4>
//      CHECK:   %[[ORIG_MASK:.+]] = vector.constant_mask {{.*}} vector<8x8x16xi1>
//      CHECK:   %[[ORIG_EXT1:.+]] = vector.extract %[[ORIG_MASK]][0] : vector<8x16xi1>
//      CHECK:   %[[ORIG_EXT2:.+]] = vector.extract %[[ORIG_EXT1]][0] : vector<16xi1>
//      CHECK:   %[[NEW_MASK:.+]] = vector.constant_mask {{.*}} vector<8x8x8xi1>
//      CHECK:   %[[NEW_EXT1:.+]] = vector.extract %[[NEW_MASK]][0] : vector<8x8xi1>
//      CHECK:   %[[NEW_EXT2:.+]] = vector.extract %[[NEW_EXT1]][0] : vector<8xi1>
//      CHECK:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<8xi8>
//      CHECK:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
// CHECK-SAME:     memref<512xi8>, vector<8xi1>, vector<8xi8> into vector<8xi8>
//      CHECK:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<8xi8> to vector<16xi4>
//      CHECK:   %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>

//      CHECK32: func @vector_extract_cst_maskedload_i4(
//      CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<128xi32>
//      CHECK32:   %[[PASSTHRU:.+]] = arith.constant dense<0> : vector<16xi4>
//      CHECK32:   %[[ORIG_MASK:.+]] = vector.constant_mask {{.*}} vector<8x8x16xi1>
//      CHECK32:   %[[ORIG_EXT1:.+]] = vector.extract %[[ORIG_MASK]][0] : vector<8x16xi1>
//      CHECK32:   %[[ORIG_EXT2:.+]] = vector.extract %[[ORIG_EXT1]][0] : vector<16xi1>
//      CHECK32:   %[[NEW_MASK:.+]] = vector.constant_mask {{.*}} vector<8x8x2xi1>
//      CHECK32:   %[[NEW_EXT1:.+]] = vector.extract %[[NEW_MASK]][0] : vector<8x2xi1>
//      CHECK32:   %[[NEW_EXT2:.+]] = vector.extract %[[NEW_EXT1]][0] : vector<2xi1>
//      CHECK32:   %[[NEW_PASSTHRU:.+]] = vector.bitcast %[[PASSTHRU]] : vector<16xi4> to vector<2xi32>
//      CHECK32:   %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%c0], %[[NEW_EXT2]], %[[NEW_PASSTHRU]] :
// CHECK32-SAME:     memref<128xi32>, vector<2xi1>, vector<2xi32> into vector<2xi32>
//      CHECK32:   %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<16xi4>
//      CHECK32:   %[[SELECT:.+]] = arith.select %[[ORIG_EXT2]], %[[BITCAST]], %[[PASSTHRU]] : vector<16xi1>, vector<16xi4>

// -----

func.func @vector_store_i8(%arg0: vector<8xi8>, %arg1: index, %arg2: index) {
    %0 = memref.alloc() : memref<4x8xi8>
    vector.store %arg0, %0[%arg1, %arg2] : memref<4x8xi8>, vector<8xi8>
    return
}

// Expect no conversions, i8 is supported.
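// With 32-bit containers (CHECK32), the 8-byte value is bitcast to vector<2xi32> and
// stored at the linearized word index s0 * 2 + s1 floordiv 4.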
//      CHECK: func @vector_store_i8
// CHECK-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xi8>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
//      CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<4x8xi8>
//      CHECK: vector.store %[[ARG0]], %[[ALLOC]][%[[ARG1]], %[[ARG2]]] : memref<4x8xi8>, vector<8xi8>

//  CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 2 + s1 floordiv 4)>
//      CHECK32: func @vector_store_i8
// CHECK32-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xi8>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
//      CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<8xi32>
//      CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
//      CHECK32: %[[VEC_I32:.+]] = vector.bitcast %[[ARG0]] : vector<8xi8> to vector<2xi32>
//      CHECK32: vector.store %[[VEC_I32]], %[[ALLOC]][%[[INDEX]]] : memref<8xi32>, vector<2xi32>

// -----

func.func @vector_store_i4(%arg0: vector<8xi4>, %arg1: index, %arg2: index) {
    %0 = memref.alloc() : memref<4x8xi4>
    vector.store %arg0, %0[%arg1, %arg2] : memref<4x8xi4>, vector<8xi4>
    return
}
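// The stored vector<8xi4> is bitcast to its container form (vector<4xi8> or
// vector<1xi32>) and written at the linearized index computed by the affine map.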

//  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
//      CHECK: func @vector_store_i4
// CHECK-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
//      CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<16xi8>
//      CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
//      CHECK: %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<4xi8>
//      CHECK: vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<16xi8>, vector<4xi8>

//  CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
//      CHECK32: func @vector_store_i4
// CHECK32-SAME:     (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
//      CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<4xi32>
//      CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
//      CHECK32: %[[VEC_I32:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<1xi32>
//      CHECK32: vector.store %[[VEC_I32]], %[[ALLOC]][%[[INDEX]]] : memref<4xi32>, vector<1xi32>

// -----

func.func @vector_store_i4_dynamic(%arg0: vector<8xi4>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) {
    %0 = memref.alloc(%arg1, %arg2) : memref<?x?xi4>
    vector.store %arg0, %0[%arg3, %arg4] : memref<?x?xi4>, vector<8xi4>
    return
}
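// Dynamic variant of the i4 store: allocation size and store index are derived from
// the dynamic dimensions with the same floordiv-by-2 (i8) / floordiv-by-8 (i32) maps
// used for loads.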

//  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) floordiv 2)>
//  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2] -> ((s2 + s0 * s1) floordiv 2)>
//      CHECK: func @vector_store_i4_dynamic
// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>
// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
// CHECK-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
//      CHECK: %[[SIZE:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
//      CHECK: %[[ALLOC:.+]] = memref.alloc(%[[SIZE]]) : memref<?xi8>
//      CHECK: %[[INDEX:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG2]], %[[ARG4]]]
//      CHECK: %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<4xi8>
//      CHECK: vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<?xi8>, vector<4xi8>

//  CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) floordiv 8)>
//  CHECK32-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2] -> ((s2 + s0 * s1) floordiv 8)>
//      CHECK32: func @vector_store_i4_dynamic
// CHECK32-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>
// CHECK32-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
// CHECK32-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
// CHECK32-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
// CHECK32-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
//      CHECK32: %[[SIZE:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
//      CHECK32: %[[ALLOC:.+]] = memref.alloc(%[[SIZE]]) : memref<?xi32>
//      CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP1]]()[%[[ARG3]], %[[ARG2]], %[[ARG4]]]
//      CHECK32: %[[VEC_I32:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<1xi32>
//      CHECK32: vector.store %[[VEC_I32]], %[[ALLOC]][%[[INDEX]]] : memref<?xi32>, vector<1xi32>

// -----

func.func @vector_maskedstore_i8(%arg0: index, %arg1: index, %arg2: index, %value: vector<8xi8>) {
  %0 = memref.alloc() : memref<3x8xi8>
  %mask = vector.create_mask %arg2 : vector<8xi1>
  vector.maskedstore %0[%arg0, %arg1], %mask, %value : memref<3x8xi8>, vector<8xi1>, vector<8xi8>
  return
}
// Expect no conversions, i8 is supported.
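// With 32-bit containers (CHECK32), the masked store becomes a read-modify-write:
// masked-load the covering i32 words, bitcast to bytes, arith.select the new bytes
// under the original mask, bitcast back, and masked-store with the narrowed mask.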
//      CHECK: func @vector_maskedstore_i8(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]
// CHECK-SAME:     %[[VAL:[a-zA-Z0-9]+]]
// CHECK-NEXT:   %[[ALLOC:.+]] = memref.alloc() : memref<3x8xi8>
// CHECK-NEXT:   %[[MASK:.+]] = vector.create_mask %[[ARG2]] : vector<8xi1>
// CHECK-NEXT:   vector.maskedstore %[[ALLOC]][%[[ARG0]], %[[ARG1]]], %[[MASK]], %[[VAL]]
// CHECK-NEXT:   return

// CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 2 + s1 floordiv 4)>
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
// CHECK32:     func @vector_maskedstore_i8(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
// CHECK32-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
// CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]
// CHECK32-SAME:     %[[VAL:[a-zA-Z0-9]+]]
// CHECK32:        %[[ALLOC:.+]] = memref.alloc() : memref<6xi32>
// CHECK32:        %[[ORIG_MASK:.+]] = vector.create_mask %[[ARG2]] : vector<8xi1>
// CHECK32:        %[[LIDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK32:        %[[MASK_IDX:.+]] = affine.apply #[[MASK_IDX_MAP]]()[%[[ARG2]]]
// CHECK32:        %[[NEW_MASK:.+]] = vector.create_mask %[[MASK_IDX]] : vector<2xi1>
// CHECK32:        %[[PASS_THRU:.+]] = arith.constant dense<0> : vector<2xi32>
// CHECK32:        %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LIDX]]], %[[NEW_MASK]], %[[PASS_THRU]]
// CHECK32:        %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<8xi8>
// CHECK32:        %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[VAL]], %[[BITCAST]] : vector<8xi1>, vector<8xi8>
// CHECK32:        %[[NEW_VAL:.+]] = vector.bitcast %[[SELECT]] : vector<8xi8> to vector<2xi32>
// CHECK32:        vector.maskedstore %[[ALLOC]][%[[LIDX]]], %[[NEW_MASK]], %[[NEW_VAL]]

// -----

func.func @vector_cst_maskedstore_i8(%arg0: index, %arg1: index, %value: vector<8xi8>) {
  %0 = memref.alloc() : memref<3x8xi8>
  %mask = vector.constant_mask [4] : vector<8xi1>
  vector.maskedstore %0[%arg0, %arg1], %mask, %value : memref<3x8xi8>, vector<8xi1>, vector<8xi8>
  return
}
// Expect no conversions, i8 is supported.
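// Constant-mask variant of the read-modify-write above: constant_mask [4] over 8 bytes
// narrows to constant_mask [1] over 2 i32 words.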
//      CHECK: func @vector_cst_maskedstore_i8(
// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
// CHECK-SAME:     %[[VAL:[a-zA-Z0-9]+]]
// CHECK-NEXT:   %[[ALLOC:.+]] = memref.alloc() : memref<3x8xi8>
// CHECK-NEXT:   %[[MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
// CHECK-NEXT:   vector.maskedstore %[[ALLOC]][%[[ARG0]], %[[ARG1]]], %[[MASK]], %[[VAL]]
// CHECK-NEXT:   return

// CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 2 + s1 floordiv 4)>
// CHECK32:     func @vector_cst_maskedstore_i8(
// CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
// CHECK32-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
// CHECK32-SAME:     %[[VAL:[a-zA-Z0-9]+]]
// CHECK32:        %[[ALLOC:.+]] = memref.alloc() : memref<6xi32>
// CHECK32:        %[[ORIG_MASK:.+]] = vector.constant_mask [4] : vector<8xi1>
// CHECK32:        %[[LIDX:.+]] = affine.apply #[[LOAD_IDX_MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK32:        %[[NEW_MASK:.+]] = vector.constant_mask [1] : vector<2xi1>
// CHECK32:        %[[PASS_THRU:.+]] = arith.constant dense<0> : vector<2xi32>
// CHECK32:        %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[LIDX]]], %[[NEW_MASK]], %[[PASS_THRU]]
// CHECK32:        %[[BITCAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi32> to vector<8xi8>
// CHECK32:        %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[VAL]], %[[BITCAST]] : vector<8xi1>, vector<8xi8>
// CHECK32:        %[[NEW_VAL:.+]] = vector.bitcast %[[SELECT]] : vector<8xi8> to vector<2xi32>
// CHECK32:        vector.maskedstore %[[ALLOC]][%[[LIDX]]], %[[NEW_MASK]], %[[NEW_VAL]]