[Codegen][LLVMGPU] TileAndDistributeToWorkgroups producing dynamic tiles given static inputs
Max191 opened this issue · comments
The following IR fails to generate mfma instructions due to dynamic dimensions introduced in TileAndDistribute:
#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
module {
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #executable_target} {
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [80, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<80x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x1280xf16>
%5 = tensor.empty() : tensor<80x1280xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<80x1280xf32>) -> tensor<80x1280xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<80x1280xf16>, tensor<1280x1280xf16>) outs(%6 : tensor<80x1280xf32>) -> tensor<80x1280xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [80, 1280], strides = [1, 1] : tensor<80x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
}
Run:
iree-opt --split-input-file --iree-codegen-llvmgpu-use-vector-distribution --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target, canonicalize))'
After TileAndDistributeToWorkgroups:
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
%4 = affine.min affine_map<()[s0] -> (s0 * -128 + 1280, 128)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%5, 0], sizes = [%3, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<?x1280xf16>
%7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %1, offsets = [0, %7], sizes = [1280, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x?xf16>
%9 = tensor.empty(%3, %4) : tensor<?x?xf32>
%10 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 128, 128]]>} ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 128, 128]]>} ins(%6, %8 : tensor<?x1280xf16>, tensor<1280x?xf16>) outs(%10 : tensor<?x?xf32>) -> tensor<?x?xf32>
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%13 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
flow.dispatch.tensor.store %11, %2, offsets = [%12, %13], sizes = [%3, %4], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
The M tile size becomes dynamic due to an affine.min, even though the tile size should be statically known at compile time:
%3 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
Later, this fails to vectorize in GenericVectorization (I assume because of the dynamic dim), and results in no mfma ops later on:
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%c128 = arith.constant 128 : index
%c1280 = arith.constant 1280 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
%6 = affine.min affine_map<()[s0] -> (s0 * -128 + 1280, 128)>()[%workgroup_id_x]
%7 = flow.dispatch.tensor.load %2, offsets = [%3, %4], sizes = [%5, %6], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>> -> tensor<?x?xf32>
%8 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%5, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<?x1280xf16>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %4], sizes = [1280, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x?xf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%7 : tensor<?x?xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %10[0, 0] [%5, %6] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%11 = scf.for %arg0 = %c0 to %c1280 step %c128 iter_args(%arg1 = %extracted_slice) -> (tensor<?x?xf32>) {
%extracted_slice_0 = tensor.extract_slice %8[0, %arg0] [%5, 128] [1, 1] : tensor<?x1280xf16> to tensor<?x128xf16>
%extracted_slice_1 = tensor.extract_slice %9[%arg0, 0] [128, %6] [1, 1] : tensor<1280x?xf16> to tensor<128x?xf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<?x128xf16>, tensor<128x?xf16>) outs(%arg1 : tensor<?x?xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%13 = arith.extf %in : f16 to f32
%14 = arith.extf %in_2 : f16 to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
scf.yield %12 : tensor<?x?xf32>
}
%inserted_slice = tensor.insert_slice %11 into %10[0, 0] [%5, %6] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
flow.dispatch.tensor.store %inserted_slice, %2, offsets = [%3, %4], sizes = [%5, %6], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
There actually seems to be no bug here. The test itself is broken: the function needs to be wrapped in a hal.executable.variant so that the number of workgroups can be inferred, which allows the tile sizes to fold to static values.