openxla / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.

Home Page: http://iree.dev/

coredump when using iree-compile

hunterzju opened this issue · comments

What happened?

iree-compile core dumps when compiling the following MLIR:

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1, d2) -> (d0, 0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "Linear"} {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @forward(%arg0: tensor<32x1x4xf32>) -> tensor<32x1x8xf32> {
    %cst = arith.constant dense<[-0.0214709044, 0.302955747, 0.148901105, 0.256956697, 0.423214197, -0.466383755, -0.0912125706, -0.380575836]> : tensor<8xf32>
    %cst_0 = arith.constant dense_resource<__elided__> : tensor<8x4xf32>
    %cst_1 = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<4x8xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<8x4xf32>) outs(%0 : tensor<4x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<4x8xf32>
    %2 = tensor.empty() : tensor<32x1x8xf32>
    %3 = affine.for %arg1 = 0 to 32 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
      %extracted_slice = tensor.extract_slice %arg0[%arg1, 1, 1] [1, 1, 4] [1, 1, 1] : tensor<32x1x4xf32> to tensor<1x1x4xf32>
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> to tensor<1x4xf32>
      %5 = tensor.empty() : tensor<1x8xf32>
      %6 = linalg.fill ins(%cst_1 : f32) outs(%5 : tensor<1x8xf32>) -> tensor<1x8xf32>
      %7 = linalg.matmul ins(%extracted_slice_2, %1 : tensor<1x4xf32>, tensor<4x8xf32>) outs(%6 : tensor<1x8xf32>) -> tensor<1x8xf32>
      %inserted_slice = tensor.insert_slice %7 into %2[%arg1, 1, 1] [1, 1, 8] [1, 1, 1] : tensor<1x8xf32> into tensor<32x1x8xf32>
      affine.yield %inserted_slice : tensor<32x1x8xf32>
    }
    %4 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %cst : tensor<32x1x8xf32>, tensor<8xf32>) outs(%2 : tensor<32x1x8xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %5 = arith.addf %in, %in_2 : f32
      linalg.yield %5 : f32
    } -> tensor<32x1x8xf32>
    return %4 : tensor<32x1x8xf32>
  }
}

Steps to reproduce your issue

Run iree-compile with the following command:
iree-compile /tmp/ireedumps/core-input.mlir --iree-input-type=tm_tensor --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-embedded-linker-path=iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=/tmp/ireedumps/core-reproducer.mlir --mlir-print-ir-after=iree-vm-ordinal-allocation

What component(s) does this issue relate to?

Compiler

Version information

IREE git hash: f29895e

Additional context

The coredump files are attached here:

This should fix your issue with DropUnitDims: llvm/llvm-project#74723

But you'll still hit some failures because support for affine.for has not been plumbed through yet. In the short term you can try using scf.for instead.
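The only structural change that requires is the loop itself: affine.for takes its bounds inline, while scf.for expects index-typed SSA values, so the bounds and step have to be materialized as arith.constant ops. A minimal sketch of just that rewrite (the full module follows below; loop bodies elided here):

  // affine.for with inline bounds, as in the original reproducer:
  %3 = affine.for %arg1 = 0 to 32 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
    ...
    affine.yield %inserted_slice : tensor<32x1x8xf32>
  }

  // equivalent scf.for: bounds and step become index-typed SSA values
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %3 = scf.for %arg1 = %c0 to %c32 step %c1 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
    ...
    scf.yield %inserted_slice : tensor<32x1x8xf32>
  }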

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1, d2) -> (d0, 0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "Linear"} {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @forward(%arg0: tensor<32x1x4xf32>) -> tensor<32x1x8xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %cst = arith.constant dense<[-0.0214709044, 0.302955747, 0.148901105, 0.256956697, 0.423214197, -0.466383755, -0.0912125706, -0.380575836]> : tensor<8xf32>
    %cst_0 = util.unfoldable_constant dense<0.0> : tensor<8x4xf32>
    %cst_1 = arith.constant 0.000000e+00 : f32 
    %0 = tensor.empty() : tensor<4x8xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<8x4xf32>) outs(%0 : tensor<4x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32 
    } -> tensor<4x8xf32>
    %2 = tensor.empty() : tensor<32x1x8xf32>
    %3 = scf.for %arg1 = %c0 to %c32 step %c1 iter_args(%arg2 = %2) -> (tensor<32x1x8xf32>) {
      %extracted_slice = tensor.extract_slice %arg0[%arg1, 1, 1] [1, 1, 4] [1, 1, 1] : tensor<32x1x4xf32> to tensor<1x1x4xf32>
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> to tensor<1x4xf32>
      %5 = tensor.empty() : tensor<1x8xf32>
      %6 = linalg.fill ins(%cst_1 : f32) outs(%5 : tensor<1x8xf32>) -> tensor<1x8xf32>
      %7 = linalg.matmul ins(%extracted_slice_2, %1 : tensor<1x4xf32>, tensor<4x8xf32>) outs(%6 : tensor<1x8xf32>) -> tensor<1x8xf32>
      %inserted_slice = tensor.insert_slice %7 into %2[%arg1, 1, 1] [1, 1, 8] [1, 1, 1] : tensor<1x8xf32> into tensor<32x1x8xf32>
      scf.yield %inserted_slice : tensor<32x1x8xf32>
    }   
    %4 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %cst : tensor<32x1x8xf32>, tensor<8xf32>) outs(%2 : tensor<32x1x8xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %5 = arith.addf %in, %in_2 : f32 
      linalg.yield %5 : f32 
    } -> tensor<32x1x8xf32>
    return %4 : tensor<32x1x8xf32>
  }
}
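One other difference from the original IR worth noting: the weights that the frontend elided as a dense_resource were swapped out so the reproducer is self-contained. util.unfoldable_constant is an IREE util dialect op commonly used in tests to supply concrete data while keeping the value from being constant-folded away; the dense<0.0> fill is just a placeholder value. The change is this single line:

  // original: weight data elided by the frontend, not usable standalone
  %cst_0 = arith.constant dense_resource<__elided__> : tensor<8x4xf32>

  // replacement: concrete placeholder data that won't be constant-folded
  %cst_0 = util.unfoldable_constant dense<0.0> : tensor<8x4xf32>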

The merge request works, thank you!