iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.

Home Page: http://iree.dev/

Hoist-into-globals pass produces different IRs on different runs over the same input IR.

pashu123 opened this issue

What happened?

Output IR 1:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    util.global.store %1, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1xi32>) -> tensor<1xi32>
    util.global.store %1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32_1 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %__hoisted_tensor_1x8xi32 = util.global.load @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%__hoisted_tensor_1x8xi32, %__hoisted_tensor_1xi32 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %1 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %1 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %0#0, @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    util.global.store %0#1, @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %__hoisted_tensor_1x8xi32_0 = util.global.load immutable @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32_1 = util.global.load immutable @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32_1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 2:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %5 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %5 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %4#0, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.global.store %4#1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 1 successfully goes through the remaining iree-compile passes and produces a valid .vmfb, whereas Output IR 2 does not.

Steps to reproduce your issue

Example MLIR:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg1: i32, %arg2: i32):
      %6 = arith.addi %arg1, %arg2 : i32
      iree_linalg_ext.yield %6 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    %5 = hal.tensor.export %4#1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
} 

Run: iree-opt -iree-util-hoist-into-globals above.mlir -mlir-disable-threading

Run it 3-4 times to see different output IRs. The nondeterminism persists even with -mlir-disable-threading.

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

Both IRs are correct; the difference comes from how the topological sort is implemented.
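
For context: a DAG generally admits more than one valid topological order, and if the sort's worklist iterates a hash container keyed by operation pointers, which order it picks can change from run to run, because pointer values (and hence hashes) differ under ASLR. A minimal standalone sketch of the effect (toy code, not IREE's implementation; the node names are made up to mirror the IR above):

#include <cstdio>
#include <unordered_set>
#include <vector>

// Two independent fills feeding one scan: visiting either fill first
// yields a valid topological order of this tiny DAG.
struct Node {
  const char *name;
  std::vector<Node *> users; // edge: this node -> its user
};

int main() {
  Node fillA{"fill_1x8xi32", {}};
  Node fillB{"fill_1xi32", {}};
  Node scan{"scan", {}};
  fillA.users.push_back(&scan);
  fillB.users.push_back(&scan);

  // Pointer-keyed hash set: iteration order depends on the nodes'
  // addresses, which change between runs under ASLR.
  std::unordered_set<Node *> roots{&fillA, &fillB};
  for (Node *n : roots)
    std::printf("visit %s\n", n->name); // the fills, in either order
  std::printf("visit %s\n", scan.name); // scan last either way
}

Both interleavings the loop can print are valid topological orders, which is why both output IRs verify; only the resulting global/initializer layout differs.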

In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

yeah and we definitely want output 2 - output 1 is silly

> In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

Happening around this loop:

for (Value constExprResult : iterOp->getResults()) {
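
The usual remedy for this class of bug, assuming the order feeding that loop ultimately derives from a hash container, is to iterate a container with deterministic order instead, e.g. llvm::SetVector (insertion-ordered). A minimal standard-C++ sketch of the same guarantee, not the actual IREE patch:

#include <unordered_set>
#include <vector>

// Insertion-ordered set: iteration order equals insertion order,
// independent of pointer values. llvm::SetVector provides the same
// guarantee and is the idiomatic choice in LLVM/MLIR-based code.
template <typename T>
class OrderedSet {
public:
  bool insert(T value) {
    if (!seen.insert(value).second)
      return false; // duplicate: keep the original position
    order.push_back(value);
    return true;
  }
  auto begin() const { return order.begin(); }
  auto end() const { return order.end(); }

private:
  std::vector<T> order;       // deterministic iteration order
  std::unordered_set<T> seen; // O(1) membership test
};

With the worklist order derived only from program structure rather than pointer values, repeated runs of iree-opt over the same input would produce identical IR.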