The hoist-into-globals pass produces different IRs on different runs over the same input IR.
pashu123 opened this issue · comments
What happened?
Output IR 1:
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
util.initializer {
%c0_i32 = arith.constant 0 : i32
%0 = tensor.empty() : tensor<1x8xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
util.global.store %1, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
util.return
}
util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
util.initializer {
%c0_i32 = arith.constant 0 : i32
%0 = tensor.empty() : tensor<1xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1xi32>) -> tensor<1xi32>
util.global.store %1, @__hoisted_tensor_1xi32 : tensor<1xi32>
util.return
}
util.global private @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
util.global private @__hoisted_tensor_1xi32_1 : tensor<1xi32>
util.initializer {
%cst = arith.constant dense<1> : tensor<1x8xi32>
%__hoisted_tensor_1x8xi32 = util.global.load @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
%__hoisted_tensor_1xi32 = util.global.load @__hoisted_tensor_1xi32 : tensor<1xi32>
%0:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%__hoisted_tensor_1x8xi32, %__hoisted_tensor_1xi32 : tensor<1x8xi32>, tensor<1xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%1 = arith.addi %arg0, %arg1 : i32
iree_linalg_ext.yield %1 : i32
} -> tensor<1x8xi32>, tensor<1xi32>
util.global.store %0#0, @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
util.global.store %0#1, @__hoisted_tensor_1xi32_1 : tensor<1xi32>
util.return
}
util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
%__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
%__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
%__hoisted_tensor_1x8xi32_0 = util.global.load immutable @__hoisted_tensor_1x8xi32_0 : tensor<1x8xi32>
%__hoisted_tensor_1xi32_1 = util.global.load immutable @__hoisted_tensor_1xi32_1 : tensor<1xi32>
%0 = hal.tensor.export %__hoisted_tensor_1xi32_1 "output0" : tensor<1xi32> -> !hal.buffer_view
util.return %0 : !hal.buffer_view
}
}
Output IR 2:
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
util.global private @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
util.global private @__hoisted_tensor_1xi32 : tensor<1xi32>
util.initializer {
%cst = arith.constant dense<1> : tensor<1x8xi32>
%c0_i32 = arith.constant 0 : i32
%0 = tensor.empty() : tensor<1x8xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
%2 = tensor.empty() : tensor<1xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
%4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%5 = arith.addi %arg0, %arg1 : i32
iree_linalg_ext.yield %5 : i32
} -> tensor<1x8xi32>, tensor<1xi32>
util.global.store %4#0, @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
util.global.store %4#1, @__hoisted_tensor_1xi32 : tensor<1xi32>
util.return
}
util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
%__hoisted_tensor_1x8xi32 = util.global.load immutable @__hoisted_tensor_1x8xi32 : tensor<1x8xi32>
%__hoisted_tensor_1xi32 = util.global.load immutable @__hoisted_tensor_1xi32 : tensor<1xi32>
%0 = hal.tensor.export %__hoisted_tensor_1xi32 "output0" : tensor<1xi32> -> !hal.buffer_view
util.return %0 : !hal.buffer_view
}
}
Output IR 1 successfully goes through the iree-compile passes and produces a valid .vmfb, whereas Output IR 2 does not.
Steps to reproduce your issue
Example MLIR.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
util.func public @tm_tensor_scan(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @tm_tensor_scan(%input0: tensor<1x8xi32>) -> (%output0: tensor<1xi32>)"}} {
%cst = arith.constant dense<1> : tensor<1x8xi32>
%c0_i32 = arith.constant 0 : i32
%0 = tensor.empty() : tensor<1x8xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x8xi32>) -> tensor<1x8xi32>
%2 = tensor.empty() : tensor<1xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1xi32>) -> tensor<1xi32>
%4:2 = iree_linalg_ext.scan dimension(1) inclusive(true) ins(%cst : tensor<1x8xi32>) outs(%1, %3 : tensor<1x8xi32>, tensor<1xi32>) {
^bb0(%arg1: i32, %arg2: i32):
%6 = arith.addi %arg1, %arg2 : i32
iree_linalg_ext.yield %6 : i32
} -> tensor<1x8xi32>, tensor<1xi32>
%5 = hal.tensor.export %4#1 "output0" : tensor<1xi32> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
Run: `iree-opt -iree-util-hoist-into-globals above.mlir -mlir-disable-threading`
Run it 3-4 times to see different output IRs. The problem persists even with `-mlir-disable-threading`.
What component(s) does this issue relate to?
Compiler
Version information
No response
Additional context
No response
Both IRs are functionally correct; the nondeterminism comes from how the topological sort is implemented (its visitation order is not stable across runs).
@stellaraccident Any thoughts?
In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?
yeah and we definitely want output 2 - output 1 is silly
In general we want the compiler to be deterministic (especially with threading disabled). Were you able to find where in the code the nondeterminism is coming from?
Happening around this loop