Core dump when using iree-compile
hunterzju opened this issue · comments
What happened?
iree-compile core dumps when compiling the following MLIR:
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1, d2) -> (d0, 0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "Linear"} {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @forward(%arg0: tensor<32x1x4xf32>) -> tensor<32x1x8xf32> {
%cst = arith.constant dense<[-0.0214709044, 0.302955747, 0.148901105, 0.256956697, 0.423214197, -0.466383755, -0.0912125706, -0.380575836]> : tensor<8xf32>
%cst_0 = arith.constant dense_resource<__elided__> : tensor<8x4xf32>
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<4x8xf32>
%1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<8x4xf32>) outs(%0 : tensor<4x8xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<4x8xf32>
%2 = tensor.empty() : tensor<32x1x8xf32>
%3 = affine.for %arg1 = 0 to 32 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
  %extracted_slice = tensor.extract_slice %arg0[%arg1, 1, 1] [1, 1, 4] [1, 1, 1] : tensor<32x1x4xf32> to tensor<1x1x4xf32>
  %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> to tensor<1x4xf32>
  %5 = tensor.empty() : tensor<1x8xf32>
  %6 = linalg.fill ins(%cst_1 : f32) outs(%5 : tensor<1x8xf32>) -> tensor<1x8xf32>
%7 = linalg.matmul ins(%extracted_slice_2, %1 : tensor<1x4xf32>, tensor<4x8xf32>) outs(%6 : tensor<1x8xf32>) -> tensor<1x8xf32>
%inserted_slice = tensor.insert_slice %7 into %2[%arg1, 1, 1] [1, 1, 8] [1, 1, 1] : tensor<1x8xf32> into tensor<32x1x8xf32>
affine.yield %inserted_slice : tensor<32x1x8xf32>
}
%4 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %cst : tensor<32x1x8xf32>, tensor<8xf32>) outs(%2 : tensor<32x1x8xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
  %5 = arith.addf %in, %in_2 : f32
  linalg.yield %5 : f32
} -> tensor<32x1x8xf32>
return %4 : tensor<32x1x8xf32>
}
}
Steps to reproduce your issue
Run iree-compile with the following command:
iree-compile /tmp/ireedumps/core-input.mlir --iree-input-type=tm_tensor --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-embedded-linker-path=iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=/tmp/ireedumps/core-reproducer.mlir --mlir-print-ir-after=iree-vm-ordinal-allocation
What component(s) does this issue relate to?
Compiler
Version information
Iree git hash: f29895e
Additional context
The coredump files attached here:
- core-command-line.txt
- core-input.mlir
- core-reproducer.mlir
crash_files.tgz.zip
This should fix your issue with DropUnitDims: llvm/llvm-project#74723
But you'll still hit some failures because support for affine.for
has not been plumbed through yet. In the short term you can try using scf.for
instead.
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1, d2) -> (d0, 0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "Linear"} {
ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
func.func @forward(%arg0: tensor<32x1x4xf32>) -> tensor<32x1x8xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%cst = arith.constant dense<[-0.0214709044, 0.302955747, 0.148901105, 0.256956697, 0.423214197, -0.466383755, -0.0912125706, -0.380575836]> : tensor<8xf32>
%cst_0 = util.unfoldable_constant dense<0.0> : tensor<8x4xf32>
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<4x8xf32>
%1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<8x4xf32>) outs(%0 : tensor<4x8xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<4x8xf32>
%2 = tensor.empty() : tensor<32x1x8xf32>
%3 = scf.for %arg1 = %c0 to %c32 step %c1 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
%extracted_slice = tensor.extract_slice %arg0[%arg1, 1, 1] [1, 1, 4] [1, 1, 1] : tensor<32x1x4xf32> to tensor<1x1x4xf32>
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> to tensor<1x4xf32>
%5 = tensor.empty() : tensor<1x8xf32>
%6 = linalg.fill ins(%cst_1 : f32) outs(%5 : tensor<1x8xf32>) -> tensor<1x8xf32>
%7 = linalg.matmul ins(%extracted_slice_2, %1 : tensor<1x4xf32>, tensor<4x8xf32>) outs(%6 : tensor<1x8xf32>) -> tensor<1x8xf32>
%inserted_slice = tensor.insert_slice %7 into %2[%arg1, 1, 1] [1, 1, 8] [1, 1, 1] : tensor<1x8xf32> into tensor<32x1x8xf32>
scf.yield %inserted_slice : tensor<32x1x8xf32>
}
%4 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %cst : tensor<32x1x8xf32>, tensor<8xf32>) outs(%2 : tensor<32x1x8xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%5 = arith.addf %in, %in_2 : f32
linalg.yield %5 : f32
} -> tensor<32x1x8xf32>
return %4 : tensor<32x1x8xf32>
}
}
This should fix your issue with
DropUnitDims
: llvm/llvm-project#74723. But you'll still hit some failures because support for
affine.for
has not been plumbed through yet. In the short term you can try using scf.for
instead.#map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1, d0)> #map2 = affine_map<(d0, d1, d2) -> (d0, 0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d2)> #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> module attributes {torch.debug_module_name = "Linear"} { ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64> func.func @forward(%arg0: tensor<32x1x4xf32>) -> tensor<32x1x8xf32> { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index %cst = arith.constant dense<[-0.0214709044, 0.302955747, 0.148901105, 0.256956697, 0.423214197, -0.466383755, -0.0912125706, -0.380575836]> : tensor<8xf32> %cst_0 = util.unfoldable_constant dense<0.0> : tensor<8x4xf32> %cst_1 = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<4x8xf32> %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<8x4xf32>) outs(%0 : tensor<4x8xf32>) { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } -> tensor<4x8xf32> %2 = tensor.empty() : tensor<32x1x8xf32> %3 = scf.for %arg1 = %c0 to %c32 step %c1 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) { %extracted_slice = tensor.extract_slice %arg0[%arg1, 1, 1] [1, 1, 4] [1, 1, 1] : tensor<32x1x4xf32> to tensor<1x1x4xf32> %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> to tensor<1x4xf32> %5 = tensor.empty() : tensor<1x8xf32> %6 = linalg.fill ins(%cst_1 : f32) outs(%5 : tensor<1x8xf32>) -> tensor<1x8xf32> %7 = linalg.matmul ins(%extracted_slice_2, %1 : tensor<1x4xf32>, tensor<4x8xf32>) outs(%6 : tensor<1x8xf32>) -> tensor<1x8xf32> %inserted_slice = tensor.insert_slice %7 into %2[%arg1, 1, 1] [1, 1, 8] [1, 1, 1] : tensor<1x8xf32> into tensor<32x1x8xf32> scf.yield %inserted_slice : tensor<32x1x8xf32> } %4 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %cst : 
tensor<32x1x8xf32>, tensor<8xf32>) outs(%2 : tensor<32x1x8xf32>) { ^bb0(%in: f32, %in_2: f32, %out: f32): %5 = arith.addf %in, %in_2 : f32 linalg.yield %5 : f32 } -> tensor<32x1x8xf32> return %4 : tensor<32x1x8xf32> } }
The merge request works, thank you!