Code generation problem for while loops with if statements in the body
ergawy opened this issue · comments
This code fragment:
extern "device" {
fn "llvm.nvvm.read.ptx.sreg.tid.x" nvvm_read_ptx_sreg_tid_x() -> i32;
fn "llvm.nvvm.barrier0" nvvm_barrier0() -> ();
}
extern "thorin" {
fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
fn sizeof[T]() -> i32;
}
extern "C" {
fn thorin_alloc(i32, i64) -> &i8;
}
struct Buffer {
device: i32,
data: &[i8]
}
fn alloc(dev: i32, size: i32) -> Buffer {
Buffer {
device: dev,
data: thorin_alloc(dev, size as i64) as &[i8]
}
}
fn thorin_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
fn alloc_cuda(dev: i32, size: i32) -> Buffer { alloc(thorin_device(1, dev), size) }
fn main() -> () {
let buf = alloc_cuda(0, sizeof[i32]() * 57);
let mut ptr = buf.data as &[i32];
with nvvm(0, (2, 2, 2), (2, 2, 2)) {
let mut id = 1;
if nvvm_read_ptx_sreg_tid_x() == 0 {
id = 5;
}
while (id < 57) {
nvvm_barrier0();
if nvvm_read_ptx_sreg_tid_x() == 0 {
// here I update a global memory location that will be read later by all other threads
ptr(id) = 1;
id += 2;
}
nvvm_barrier0();
}
}
}
Generates the following nvvm code:
; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda:
%0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%1 = icmp eq i32 %0, 0
br i1 %1, label %next10.us, label %next10
next10.us: ; preds = %lambda, %next10.us
%id111.us = phi i32 [ %3, %next10.us ], [ 5, %lambda ]
tail call ptx_device void @llvm.nvvm.barrier0()
%2 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id111.us
store i32 1, i32* %2
%3 = add nsw i32 %id111.us, 2
tail call ptx_device void @llvm.nvvm.barrier0()
%4 = icmp slt i32 %3, 57
br i1 %4, label %next10.us, label %next2
next2: ; preds = %next10.us
ret void
next10: ; preds = %next10, %lambda
tail call ptx_device void @llvm.nvvm.barrier0()
tail call ptx_device void @llvm.nvvm.barrier0()
br label %next10
}
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}
The problem is that two versions of the while loop body are generated: one for thread 0 and one for all other threads. As a result, I get the expected result for thread 0, but all other threads produce wrong results.
In my case, thread 0 updates some device memory location which will be later read by other threads.
This happens only when the code is compiled with the -O3 compiler option. If no optimization flag is passed, the code is generated as expected:
; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda_crit_268_start:
br label %lambda_crit_268
lambda_crit_268: ; preds = %lambda_crit_268_start
%0 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
br label %lambda
lambda: ; preds = %lambda_crit_268
%llvm.nvvm.read.ptx.sreg.tid.x = phi i32 [ %0, %lambda_crit_268 ]
%1 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x, 0
br i1 %1, label %if_then, label %if_else
if_else: ; preds = %lambda
br label %next
if_then: ; preds = %lambda
br label %next
next: ; preds = %if_then, %if_else
%id = phi i32 [ 5, %if_then ], [ 1, %if_else ]
br label %while_head
while_head: ; preds = %next10, %next
%id1 = phi i32 [ %id, %next ], [ %id9, %next10 ]
%2 = icmp slt i32 %id1, 57
br i1 %2, label %while_body, label %next2
next2: ; preds = %while_head
ret void
while_body: ; preds = %while_head
call ptx_device void @llvm.nvvm.barrier0()
br label %while_body3
while_body3: ; preds = %while_body
%3 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
br label %while_body4
while_body4: ; preds = %while_body3
%llvm.nvvm.read.ptx.sreg.tid.x5 = phi i32 [ %3, %while_body3 ]
%4 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x5, 0
br i1 %4, label %if_then7, label %if_else6
if_else6: ; preds = %while_body4
br label %next8
if_then7: ; preds = %while_body4
%5 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id1
store i32 1, i32* %5
%6 = add nsw i32 2, %id1
br label %next8
next8: ; preds = %if_then7, %if_else6
%id9 = phi i32 [ %id1, %if_else6 ], [ %6, %if_then7 ]
call ptx_device void @llvm.nvvm.barrier0()
br label %next10
next10: ; preds = %next8
br label %while_head
}
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}
The generated code using -O3 looks correct:
LLVM applies loop unswitching and generates the two versions.
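For threadIdx.x == 0 the code terminates; for every other thread there is an infinite loop, because id is only incremented inside the if that only thread 0 enters (this is already the case in the Impala implementation).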
The problem here is that this optimization is performed by Thorin (or by the LLVM backend of Thorin). This should NOT happen, since the barriers will then be executed by only one thread (thread lane 0), which is undefined behaviour (see http://stackoverflow.com/questions/12519573/cuda-syncthreads-inside-if-statements).
Just for the record: The optimization is done by LLVM. As soon as Thorin emits LLVM code, all of Thorin's optimization passes have been performed. -O0, -O3, etc. are the optimization levels of LLVM. Thorin only knows -Othorin to turn on optimizations. This is implied by -emit-llvm.
Yes, but we should prevent this from happening here. Maybe we could use a special flag in the LLVM IR that indicates that the barrier function has side effects, or has a behaviour that is incompatible with this optimization.
Where does this function decl come from?
; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1
Is this the official way to declare the NVVM barrier function?
This is actually a bug in LLVM 3.4: the barrier function requires the noduplicate attribute, which is missing from IntrinsicsNVVM.td.
Using LLVM 3.8, we get the following, correct code:
; ModuleID = 'test'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda:
%0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%1 = icmp eq i32 %0, 0
%. = select i1 %1, i32 5, i32 1
br label %while_body4
next2: ; preds = %next10
ret void
while_body4: ; preds = %lambda, %next10
%id111 = phi i32 [ %., %lambda ], [ %id9, %next10 ]
tail call ptx_device void @llvm.nvvm.barrier0()
br i1 %1, label %if_then7, label %next10
if_then7: ; preds = %while_body4
%2 = sext i32 %id111 to i64
%3 = getelementptr inbounds [0 x i32], [0 x i32]* %_271_389, i64 0, i64 %2
store i32 1, i32* %3, align 4
%4 = add nsw i32 %id111, 2
br label %next10
next10: ; preds = %if_then7, %while_body4
%id9 = phi i32 [ %4, %if_then7 ], [ %id111, %while_body4 ]
tail call ptx_device void @llvm.nvvm.barrier0()
%5 = icmp slt i32 %id9, 57
br i1 %5, label %while_body4, label %next2
}
; Function Attrs: nounwind readnone
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*) #0
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: noduplicate nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1
attributes #0 = { nounwind readnone }
attributes #1 = { noduplicate nounwind }
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = !{i64 1, i64 2}
!1 = !{void ([0 x i32]*)* @lambda_crit_268, !"kernel", i64 1}
Btw, we will have the same problem for SPIR in 3.8.
In case we want to support SPIR in the future, we may want to annotate attributes in Impala, e.g.
extern "device" {
fn "_Z7barrierj attribute(noduplicate)" barrier(i32) -> ();
}