iree-org / iree

A retargetable MLIR-based machine learning compiler and runtime toolkit.

Home Page: http://iree.dev/

Hoist-into-globals pass produces different IRs on different runs over the same input IR.

pashu123 opened this issue

What happened?

Output IR 1:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    util.global.store %1, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1xi32>) -> tensor<1xi32>
    util.global.store %1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.global private @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32_1 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %__hoisted_tensor_1x8xi32 = util.global.load @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%__hoisted_tensor_1x8xi32, %__hoisted_tensor_1xi32 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %1 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %1 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %0#0, @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    util.global.store %0#1, @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %__hoisted_tensor_1x8xi32_0 = util.global.load immutable @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32_1 = util.global.load immutable @__hoisted_tensor_1xi32_1 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32_1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 2:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
  util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
  util.initializer {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %5 = arith.addi %arg0, %arg1 : i32
      iree_linalg_ext.yield %5 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    util.global.store %4#0, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    util.global.store %4#1, @__hoisted_tensor_1xi32 : tensor<1xi32>
    util.return
  }
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
    %__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
    %0 = hal.tensor.export %__hoisted_tensor_1xi32 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %0 : !hal.buffer_view
  }
}

Output IR 1 successfully goes through the remaining iree-compile passes and produces a valid .vmfb, whereas Output IR 2 does not.

Steps to reproduce your issue

Example MLIR:

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
    %cst = arith.constant dense<1> : tensor<1x8xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<1x8xi32>
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
    %2 = tensor.empty() : tensor<1xi32>
    %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
    %4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
    ^bb0(%arg1: i32, %arg2: i32):
      %6 = arith.addi %arg1, %arg2 : i32
      iree_linalg_ext.yield %6 : i32
    } -> tensor<1x8xi32>, tensor<1xi32>
    %5 = hal.tensor.export %4#1 "output0" : tensor<1xi32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
} 

Run: iree-opt -iree-util-hoist-into-globals above.mlir -mlir-disable-threading

Run it 3-4 times to see different output IRs. The nondeterminism persists even with -mlir-disable-threading.

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

Both IRs are correct; the difference comes from how the topological sort is implemented.
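
For context: a DAG generally admits more than one valid topological order, and if the sort's worklist iterates a hash container keyed by operation pointers, which order it picks can change from run to run, because pointer values (and hence hashes) differ under ASLR. A minimal standalone sketch of the effect (toy code, not IREE's implementation; the node names are made up to mirror the IR above):

#include <cstdio>
#include <unordered_set>
#include <vector>

// Two independent fills feeding one scan: visiting either fill first
// yields a valid topological order of this tiny DAG.
struct Node {
  const char *name;
  std::vector<Node *> users; // edge: this node -> its user
};

int main() {
  Node fillA{"fill_1x8xi32", {}};
  Node fillB{"fill_1xi32", {}};
  Node scan{"scan", {}};
  fillA.users.push_back(&scan);
  fillB.users.push_back(&scan);

  // Pointer-keyed hash set: iteration order depends on the nodes'
  // addresses, which change between runs under ASLR.
  std::unordered_set<Node *> roots{&fillA, &fillB};
  for (Node *n : roots)
    std::printf("visit %s\n", n->name); // the fills, in either order
  std::printf("visit %s\n", scan.name); // scan last either way
}

Both interleavings the loop can print are valid topological orders, which is why both output IRs verify; only the resulting global/initializer layout differs.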

In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

yeah and we definitely want output 2 - output 1 is silly

> In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?

Happening around this loop:

for (Value constExprResult : iterOp->getResults()) {
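
The usual remedy for this class of bug, assuming the order feeding that loop ultimately derives from a hash container, is to iterate a container with deterministic order instead, e.g. llvm::SetVector (insertion-ordered). A minimal standard-C++ sketch of the same guarantee, not the actual IREE patch:

#include <unordered_set>
#include <vector>

// Insertion-ordered set: iteration order equals insertion order,
// independent of pointer values. llvm::SetVector provides the same
// guarantee and is the idiomatic choice in LLVM/MLIR-based code.
template <typename T>
class OrderedSet {
public:
  bool insert(T value) {
    if (!seen.insert(value).second)
      return false; // duplicate: keep the original position
    order.push_back(value);
    return true;
  }
  auto begin() const { return order.begin(); }
  auto end() const { return order.end(); }

private:
  std::vector<T> order;       // deterministic iteration order
  std::unordered_set<T> seen; // O(1) membership test
};

With the worklist order derived only from program structure rather than pointer values, repeated runs of iree-opt over the same input would produce identical IR.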