[GPU] Vector shape does not match the layout
hanhanW opened this issue · comments
@Max191 and I looked at enabling the PadAndVectorDistribution pipeline and found that vector distribution fails in one of the cases. To repro: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmgpu-vector-distribute{test-layout}, canonicalize, cse))' ~/repro.mlir
// Reproducer for the "Vector shape does not match the layout" failure.
// The translation_info selects the LLVMGPUPadAndVectorDistribute pipeline with
// a 64-thread workgroup (subgroup size 64) and an mma_schedule that uses the
// MFMA_F16_16x16x16_F32 intrinsic with one subgroup along M and one along N.
func.func @foo() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>} {
%c0 = arith.constant 0 : index
// f16 zero used as the padding value of every vector.transfer_read below.
%cst = arith.constant 0.000000e+00 : f16
%c320 = arith.constant 320 : index
%c64 = arith.constant 64 : index
// Zero-initialized 1x16x4 accumulator carried through the scf.for loop.
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x16x4xf16>
// Buffers in the workgroup address space: %alloc holds the 1x64x4 operand
// tile and %alloc_1 holds the 1x16x64 operand tile.
%alloc = memref.alloc() : memref<1x64x4xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<1x16x64xf16, #gpu.address_space<workgroup>>
// Bindings 0 and 1 are read-only inputs (64x968x320 and 64x320x4, f16);
// binding 2 is the 64x968x4 f16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>>
// workgroup_id_y indexes the leading (size-64) dimension; workgroup_id_x
// selects a tile of up to 16 rows along the 968 dimension.
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
// %3 = workgroup_id_x * 16 is the row offset of this workgroup's tile.
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
// %4 = min(968 - 16 * workgroup_id_x, 16) is the number of valid rows. 968 is
// not a multiple of 16, so the last tile is partial and the size is dynamic.
%4 = affine.min affine_map<()[s0] -> (s0 * -16 + 968, 16)>()[%workgroup_id_x]
%subview = memref.subview %0[%workgroup_id_y, %3, 0] [1, %4, 320] [1, 1, 1] : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x320xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_y, 0, 0] [1, 320, 4] [1, 1, 1] : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>> to memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Reduction loop over the 320-wide dimension in steps of 64 (five iterations),
// carrying the 1x16x4 f16 accumulator in %arg1.
%5 = scf.for %arg0 = %c0 to %c320 step %c64 iter_args(%arg1 = %cst_0) -> (vector<1x16x4xf16>) {
%subview_4 = memref.subview %subview[0, 0, %arg0] [1, %4, 64] [1, 1, 1] : memref<1x?x320xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Dim 1 is not marked in-bounds because the source has only %4 rows; rows
// past the end are filled with the padding value %cst.
%6 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x16x64xf16>
%7 = vector.transfer_read %subview_2[%c0, %arg0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x4xf16>
// Copy both operand tiles into the workgroup-address-space buffers, with
// barriers around the writes, before reading them back for the contraction.
gpu.barrier
vector.transfer_write %6, %alloc_1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x16x64xf16>, memref<1x16x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %7, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x4xf16>, memref<1x64x4xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc_1[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x16x64xf16, #gpu.address_space<workgroup>>, vector<1x16x64xf16>
// This read of the 1x64x4 operand is the op the reported diagnostic points
// at: vector shape [1, 64, 4] versus a layout expecting 0 at dim 2.
%9 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x4xf16, #gpu.address_space<workgroup>>, vector<1x64x4xf16>
// The accumulator is extended to f32 for the contraction and truncated back
// to f16 before being yielded.
%10 = arith.extf %arg1 : vector<1x16x4xf16> to vector<1x16x4xf32>
// Batched matmul-style contraction: d0 is the batch dim, d1 and d2 are the
// parallel result dims (16 and 4), and d3 is the reduction dim (64).
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %9, %10 : vector<1x16x64xf16>, vector<1x64x4xf16> into vector<1x16x4xf32>
%12 = arith.truncf %11 : vector<1x16x4xf32> to vector<1x16x4xf16>
scf.yield %12 : vector<1x16x4xf16>
}
// Write the result tile back to the output. Dim 1 is again not marked
// in-bounds because the destination has only %4 rows.
%subview_3 = memref.subview %2[%workgroup_id_y, %3, 0] [1, %4, 4] [1, 1, 1] : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x4xf16, strided<[3872, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %5, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x16x4xf16>, memref<1x?x4xf16, strided<[3872, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_1 : memref<1x16x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x4xf16, #gpu.address_space<workgroup>>
return
}
Error:
transfer '%7 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x16x64xf16>' vector layout: #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 2, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 8, 8], elements_per_thread = [1, 1, 8], subgroup_basis = [1, 1, 1], thread_basis = [1, 8, 8]>
transfer '%8 = vector.transfer_read %subview_2[%c0, %arg0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x4xf16>' vector layout: #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 64, 1], elements_per_thread = [1, 1, 4], subgroup_basis = [1, 1, 1], thread_basis = [1, 64, 1]>
/home/hanchung/z.mlir:31:10: error: Vector shape: [1, 64, 4] does not match the layout (nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 4, 0], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], subgroup_order = [0, 2, 1], subgroup_basis = [1, 1, 1, 1], subgroup_active_ids = [true, false, true, true], thread_basis = [1, 4, 16]>) at dim 2. Dimension expected by layout: 0 actual: 4
%9 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x4xf16, #gpu.address_space<workgroup>>, vector<1x64x4xf16>
^
/home/hanchung/z.mlir:1:1: error: 'func.func' op failed to distribute
@qedawkins @Groverkss could you take a look?
batches_per_subgroup = [1, 4, 0]
Looks like the config that sets the layout for the contraction is messed up there. I can have a look tomorrow and send a fix.
This looks wrong to me
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %9, %10 : vector<1x16x64xf16>, vector<1x64x4xf16> into vector<1x16x4xf32>
There is no intrinsic implemented for a 16x4 tile.
Can you give me the original mlir file? I can at least add a failure when there is no intrinsic available for this shape.
Here is the input IR:
// Original input IR for the failing dispatch: a single linalg.batch_matmul
// with f16 operands and an f32 result.
hal.executable public @conv_2d_nchw_fchw_dispatch_1 {
// ROCm variant targeting gfx942; the target advertises the
// MFMA_F16_16x16x16_F32 and MFMA_F16_32x32x8_F32 intrinsics.
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
// Export with three storage-buffer bindings: two read-only and one without
// the ReadOnly flag.
hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x968x4x320_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x968x4x320_f16xf16xf32() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Offset applied to the output binding (binding 2) below.
%c39649280 = arith.constant 39649280 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c39649280) : !flow.dispatch.tensor<writeonly:tensor<64x968x4xf32>>
// Load both operands in full: 64x968x320 and 64x320x4, both f16.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>> -> tensor<64x968x320xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 320, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>> -> tensor<64x320x4xf16>
// Zero-filled f32 accumulator for the 64x968x4 result.
%5 = tensor.empty() : tensor<64x968x4xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x968x4xf32>) -> tensor<64x968x4xf32>
// Batch matmul over 64 batches: (968x320) * (320x4) -> 968x4. The result's
// last dimension is only 4 wide.
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x320xf16>, tensor<64x320x4xf16>) outs(%6 : tensor<64x968x4xf32>) -> tensor<64x968x4xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 4], strides = [1, 1, 1] : tensor<64x968x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x968x4xf32>>
return
}
}
}
}
However, do we need to add a failure? I thought that all the information needed for vector distribution is in the previous snippet. The bug is in the tile size configuration, which is in my prototype?