[Codegen][LLVMGPU] TileAndDistributeToWorkgroups producing dynamic tiles given static inputs
Max191 opened this issue · comments
The following IR fails to generate mfma instructions due to dynamic dimensions introduced in TileAndDistribute:
#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
module {
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #executable_target} {
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [80, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<80x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x1280xf16>
%5 = tensor.empty() : tensor<80x1280xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<80x1280xf32>) -> tensor<80x1280xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<80x1280xf16>, tensor<1280x1280xf16>) outs(%6 : tensor<80x1280xf32>) -> tensor<80x1280xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [80, 1280], strides = [1, 1] : tensor<80x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
}
Run:
iree-opt --split-input-file --iree-codegen-llvmgpu-use-vector-distribution --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target, canonicalize))'
After TileAndDistributeToWorkgroups:
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
%4 = affine.min affine_map<()[s0] -> (s0 * -128 + 1280, 128)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%5, 0], sizes = [%3, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<?x1280xf16>
%7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %1, offsets = [0, %7], sizes = [1280, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x?xf16>
%9 = tensor.empty(%3, %4) : tensor<?x?xf32>
%10 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 128, 128]]>} ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 128, 128]]>} ins(%6, %8 : tensor<?x1280xf16>, tensor<1280x?xf16>) outs(%10 : tensor<?x?xf32>) -> tensor<?x?xf32>
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%13 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
flow.dispatch.tensor.store %11, %2, offsets = [%12, %13], sizes = [%3, %4], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
The M tile size becomes dynamic due to an affine.min, even though the tile size should be statically known at compile time:
%3 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
Later, this fails to vectorize in GenericVectorization (I assume because of the dynamic dim), and results in no mfma ops later on:
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
func.func @fit_shared_memory_schedule_matmul() attributes {hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%c128 = arith.constant 128 : index
%c1280 = arith.constant 1280 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c129181184 = arith.constant 129181184 : index
%c18112 = arith.constant 18112 : index
%c100980224 = arith.constant 100980224 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -16 + 80, 16)>()[%workgroup_id_y]
%6 = affine.min affine_map<()[s0] -> (s0 * -128 + 1280, 128)>()[%workgroup_id_x]
%7 = flow.dispatch.tensor.load %2, offsets = [%3, %4], sizes = [%5, %6], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>> -> tensor<?x?xf32>
%8 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%5, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<80x1280xf16>> -> tensor<?x1280xf16>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %4], sizes = [1280, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x?xf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%7 : tensor<?x?xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %10[0, 0] [%5, %6] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%11 = scf.for %arg0 = %c0 to %c1280 step %c128 iter_args(%arg1 = %extracted_slice) -> (tensor<?x?xf32>) {
%extracted_slice_0 = tensor.extract_slice %8[0, %arg0] [%5, 128] [1, 1] : tensor<?x1280xf16> to tensor<?x128xf16>
%extracted_slice_1 = tensor.extract_slice %9[%arg0, 0] [128, %6] [1, 1] : tensor<1280x?xf16> to tensor<128x?xf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<?x128xf16>, tensor<128x?xf16>) outs(%arg1 : tensor<?x?xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%13 = arith.extf %in : f16 to f32
%14 = arith.extf %in_2 : f16 to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
scf.yield %12 : tensor<?x?xf32>
}
%inserted_slice = tensor.insert_slice %11 into %10[0, 0] [%5, %6] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
flow.dispatch.tensor.store %inserted_slice, %2, offsets = [%3, %4], sizes = [%5, %6], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<80x1280xf32>>
return
}
There actually seems to be no bug here. The test itself is broken: the function needs to be wrapped in a hal.executable.variant so that the number of workgroups can be inferred, which allows the tile sizes to fold to static values.