iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.

Home Page: http://iree.dev/

[GPU] Vector shape does not match the layout

hanhanW opened this issue · comments

@Max191 and I looked at enabling the LLVMGPUPadAndVectorDistribute pipeline and found that it fails during vector distribution in one of the cases. To repro:

iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmgpu-vector-distribute{test-layout}, canonicalize, cse))' ~/repro.mlir

func.func @foo() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>} {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16
  %c320 = arith.constant 320 : index
  %c64 = arith.constant 64 : index
  %cst_0 = arith.constant dense<0.000000e+00> : vector<1x16x4xf16>
  %alloc = memref.alloc() : memref<1x64x4xf16, #gpu.address_space<workgroup>>
  %alloc_1 = memref.alloc() : memref<1x16x64xf16, #gpu.address_space<workgroup>>
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>>
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %4 = affine.min affine_map<()[s0] -> (s0 * -16 + 968, 16)>()[%workgroup_id_x]
  %subview = memref.subview %0[%workgroup_id_y, %3, 0] [1, %4, 320] [1, 1, 1] : memref<64x968x320xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x320xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %subview_2 = memref.subview %1[%workgroup_id_y, 0, 0] [1, 320, 4] [1, 1, 1] : memref<64x320x4xf16, #hal.descriptor_type<storage_buffer>> to memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  %5 = scf.for %arg0 = %c0 to %c320 step %c64 iter_args(%arg1 = %cst_0) -> (vector<1x16x4xf16>) {
    %subview_4 = memref.subview %subview[0, 0, %arg0] [1, %4, 64] [1, 1, 1] : memref<1x?x320xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %6 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x16x64xf16>
    %7 = vector.transfer_read %subview_2[%c0, %arg0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x4xf16>
    gpu.barrier
    vector.transfer_write %6, %alloc_1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x16x64xf16>, memref<1x16x64xf16, #gpu.address_space<workgroup>>
    gpu.barrier
    vector.transfer_write %7, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x4xf16>, memref<1x64x4xf16, #gpu.address_space<workgroup>>
    gpu.barrier
    %8 = vector.transfer_read %alloc_1[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x16x64xf16, #gpu.address_space<workgroup>>, vector<1x16x64xf16>
    %9 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x4xf16, #gpu.address_space<workgroup>>, vector<1x64x4xf16>
    %10 = arith.extf %arg1 : vector<1x16x4xf16> to vector<1x16x4xf32>
    %11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %9, %10 : vector<1x16x64xf16>, vector<1x64x4xf16> into vector<1x16x4xf32>
    %12 = arith.truncf %11 : vector<1x16x4xf32> to vector<1x16x4xf16>
    scf.yield %12 : vector<1x16x4xf16>
  }
  %subview_3 = memref.subview %2[%workgroup_id_y, %3, 0] [1, %4, 4] [1, 1, 1] : memref<64x968x4xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x4xf16, strided<[3872, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  vector.transfer_write %5, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x16x4xf16>, memref<1x?x4xf16, strided<[3872, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  memref.dealloc %alloc_1 : memref<1x16x64xf16, #gpu.address_space<workgroup>>
  memref.dealloc %alloc : memref<1x64x4xf16, #gpu.address_space<workgroup>>
  return
}

Error:

transfer '%7 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[309760, 320, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x16x64xf16>' vector layout: #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 2, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 8, 8], elements_per_thread = [1, 1, 8], subgroup_basis = [1, 1, 1], thread_basis = [1, 8, 8]>                     
transfer '%8 = vector.transfer_read %subview_2[%c0, %arg0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x320x4xf16, strided<[1280, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x4xf16>' vector layout: #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 64, 1], elements_per_thread = [1, 1, 4], subgroup_basis = [1, 1, 1], thread_basis = [1, 64, 1]>
/home/hanchung/z.mlir:31:10: error: Vector shape: [1, 64, 4] does not match the layout (nested_layout<subgroups_per_workgroup = [1, 1, 1], batches_per_subgroup = [1, 4, 0], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], subgroup_order = [0, 2, 1], subgroup_basis = [1, 1, 1, 1], subgroup_active_ids = [true, false, true, true], thread_basis = [1, 4, 16]>) at dim 2. Dimension expected by layout: 0 actual: 4
    %9 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x4xf16, #gpu.address_space<workgroup>>, vector<1x64x4xf16>
         ^
/home/hanchung/z.mlir:1:1: error: 'func.func' op failed to distribute
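For context, my reading of the nested layout (an assumption on my part, not documented behavior): the undistributed size of dimension d should equal subgroups_per_workgroup[d] * batches_per_subgroup[d] * outers_per_batch[d] * threads_per_outer[d] * elements_per_thread[d]. For dim 2 of the failing layout that product is 1 * 0 * 1 * 16 * 1 = 0, while vector<1x64x4xf16> has size 4 there, and the zero comes straight from batches_per_subgroup = [1, 4, 0]. (Dim 1 checks out: 1 * 4 * 1 * 4 * 4 = 64.)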

@qedawkins @Groverkss could you take a look?

batches_per_subgroup = [1, 4, 0]

Looks like the config that sets the layout for the contraction got messed up there. I can have a look tomorrow and send a fix.

Thanks!

FYI, this is generated with #17234 + #17264. We need to use llvm::divideCeil to compute the number of tiles; that is only done in GPUHeuristics.cpp, not in other places. I could have missed updating some logic and generated wrong subgroup_m_count and subgroup_n_count values.
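A minimal sketch of the suspected rounding issue (a hypothetical standalone snippet, not IREE's actual configuration code): with truncating division, the N dimension of this matmul yields zero tiles, which matches the zero in batches_per_subgroup above, while llvm::divideCeil counts the padded tile.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  const int64_t problemN = 4;    // N of the 64x968x4 batch_matmul output
  const int64_t intrinsicN = 16; // N of the MFMA_F16_16x16x16_F32 intrinsic
  // Truncating division drops the partial tile: 4 / 16 == 0.
  const int64_t truncated = problemN / intrinsicN;
  // divideCeil rounds up and counts the tile that padding completes: 1.
  const int64_t rounded = llvm::divideCeil(problemN, intrinsicN);
  assert(truncated == 0 && rounded == 1);
  return 0;
}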

This looks wrong to me:

%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %9, %10 : vector<1x16x64xf16>, vector<1x64x4xf16> into vector<1x16x4xf32>

There is no intrinsic implemented for a 16x4 tile.
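For the contraction above, the accumulator tile is M x N = 16x4 (from vector<1x16x64xf16> times vector<1x64x4xf16>), while the target in the input IR below only advertises MFMA_F16_16x16x16_F32 and MFMA_F16_32x32x8_F32. The smallest N either intrinsic produces is 16, so a 16x4 tile has no direct mapping unless N is padded from 4 up to 16.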

Can you give me the original mlir file? I can at least add a failure when there is no intrinsic available for this shape.

Here is the input IR:

hal.executable public @conv_2d_nchw_fchw_dispatch_1 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
    hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x968x4x320_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x968x4x320_f16xf16xf32() {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %c39649280 = arith.constant 39649280 : index
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>>
        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c39649280) : !flow.dispatch.tensor<writeonly:tensor<64x968x4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>> -> tensor<64x968x320xf16>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 320, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>> -> tensor<64x320x4xf16>
        %5 = tensor.empty() : tensor<64x968x4xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x968x4xf32>) -> tensor<64x968x4xf32>
        %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x320xf16>, tensor<64x320x4xf16>) outs(%6 : tensor<64x968x4xf32>) -> tensor<64x968x4xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 4], strides = [1, 1, 1] : tensor<64x968x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x968x4xf32>>
        return
      }
    }
  }
}

However, do we need to add a failure for this? I thought all the information needed for vector distribution is already in the previous snippet, and the bug is in the tile size configuration, which is in my prototype.