AnyDSL / thorin

The Higher-Order Intermediate Representation

Home Page: https://anydsl.github.io

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Code generation problem for while loops with if statements in the body

ergawy opened this issue · comments

This code fragment:

extern "device" {
    fn "llvm.nvvm.read.ptx.sreg.tid.x" nvvm_read_ptx_sreg_tid_x() -> i32;
    fn "llvm.nvvm.barrier0" nvvm_barrier0() -> ();
}

extern "thorin" {
    fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
    fn sizeof[T]() -> i32;
}

extern "C" {
    fn thorin_alloc(i32, i64) -> &i8;
}

struct Buffer {
    device: i32,
    data: &[i8]
}

fn alloc(dev: i32, size: i32) -> Buffer {
    Buffer {
        device: dev,
        data: thorin_alloc(dev, size as i64) as &[i8]
    }
}

fn thorin_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
fn alloc_cuda(dev: i32, size: i32) -> Buffer { alloc(thorin_device(1, dev), size) }

fn main() -> () {
  let buf = alloc_cuda(0, sizeof[i32]() * 57);
  let mut ptr = buf.data as &[i32];
  with nvvm(0, (2, 2, 2), (2, 2, 2)) {
    let mut id = 1;
    if nvvm_read_ptx_sreg_tid_x() == 0 {
      id = 5;
    }

    while (id < 57) {
      nvvm_barrier0();
      if nvvm_read_ptx_sreg_tid_x() == 0 {
        // here I update a global memory location that will be later read by all other threads
        ptr(id) = 1;
        id += 2;
      }
      nvvm_barrier0();
    }
  }
}

Generates the following nvvm code:

; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"

define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda:
  %0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %next10.us, label %next10

next10.us:                                        ; preds = %lambda, %next10.us
  %id111.us = phi i32 [ %3, %next10.us ], [ 5, %lambda ]
  tail call ptx_device void @llvm.nvvm.barrier0()
  %2 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id111.us
  store i32 1, i32* %2
  %3 = add nsw i32 %id111.us, 2
  tail call ptx_device void @llvm.nvvm.barrier0()
  %4 = icmp slt i32 %3, 57
  br i1 %4, label %next10.us, label %next2

next2:                                            ; preds = %next10.us
  ret void

next10:                                           ; preds = %next10, %lambda
  tail call ptx_device void @llvm.nvvm.barrier0()
  tail call ptx_device void @llvm.nvvm.barrier0()
  br label %next10
}

declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)

; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}

!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}

The problem is that two versions of the while loop body are generated: one for thread 0 and one for the other threads. As a result, I get the expected result for thread 0 but all other threads produce wrong results.

In my case, thread 0 updates some device memory location which will be later read by other threads.

This happens only when the code is compiled with -O3 compiler option. If no optimization flag is passed the code is generated as expected:

; ModuleID = 'broken2'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"

define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda_crit_268_start:
  br label %lambda_crit_268

lambda_crit_268:                                  ; preds = %lambda_crit_268_start
  %0 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  br label %lambda

lambda:                                           ; preds = %lambda_crit_268
  %llvm.nvvm.read.ptx.sreg.tid.x = phi i32 [ %0, %lambda_crit_268 ]
  %1 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x, 0
  br i1 %1, label %if_then, label %if_else

if_else:                                          ; preds = %lambda
  br label %next

if_then:                                          ; preds = %lambda
  br label %next

next:                                             ; preds = %if_then, %if_else
  %id = phi i32 [ 5, %if_then ], [ 1, %if_else ]
  br label %while_head

while_head:                                       ; preds = %next10, %next
  %id1 = phi i32 [ %id, %next ], [ %id9, %next10 ]
  %2 = icmp slt i32 %id1, 57
  br i1 %2, label %while_body, label %next2

next2:                                            ; preds = %while_head
  ret void

while_body:                                       ; preds = %while_head
  call ptx_device void @llvm.nvvm.barrier0()
  br label %while_body3

while_body3:                                      ; preds = %while_body
  %3 = call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  br label %while_body4

while_body4:                                      ; preds = %while_body3
  %llvm.nvvm.read.ptx.sreg.tid.x5 = phi i32 [ %3, %while_body3 ]
  %4 = icmp eq i32 %llvm.nvvm.read.ptx.sreg.tid.x5, 0
  br i1 %4, label %if_then7, label %if_else6

if_else6:                                         ; preds = %while_body4
  br label %next8

if_then7:                                         ; preds = %while_body4
  %5 = getelementptr inbounds [0 x i32]* %_271_389, i64 0, i32 %id1
  store i32 1, i32* %5
  %6 = add nsw i32 2, %id1
  br label %next8

next8:                                            ; preds = %if_then7, %if_else6
  %id9 = phi i32 [ %id1, %if_else6 ], [ %6, %if_then7 ]
  call ptx_device void @llvm.nvvm.barrier0()
  br label %next10

next10:                                           ; preds = %next8
  br label %while_head
}

declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)

; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}

!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x i32]*)* @lambda_crit_268, metadata !"kernel", i64 1}

The generated code using -O3 looks correct:
LLVM applies loop unswitching and generates the two versions.
For threadIdx.x == 0 the code terminates, otherwise there is an infinite loop (also in the Impala implementation).

The problem here is that this optimization is performed by thorin (or by the LLVM backend of thorin). This should NOT happen since the barriers will be executed by only one thread (the thread lane 0), which is undefined behaviour (see http://stackoverflow.com/questions/12519573/cuda-syncthreads-inside-if-statements).

Just for the record: The optimization is done by LLVM. As soon as Thorin emits LLVM code, all of Thorin's optimization passes have been performed. -O0, -O3, etc. is the optimization level of LLVM. Thorin only knows -Othorin to turn on optimizations. This is implied by -emit-llvm.

Yes, but we should prevent this from happening here. Maybe we could use a special flag in the LLVM IR that indicates that the barrier function has side effects, or has a behaviour that is incompatible with this optimization.

Where does this function decl come from?

; Function Attrs: nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

Is this the official way to declare the NVVM barrier function?

This is actually a bug in LLVM 3.4: the barrier function requires the noduplicate attribute, which is missing from IntrinsicsNVVM.td.

Using LLVM 3.8, we get the following, correct code:

; ModuleID = 'test'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

define ptx_kernel void @lambda_crit_268([0 x i32]* %_271_389) {
lambda:
  %0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %1 = icmp eq i32 %0, 0
  %. = select i1 %1, i32 5, i32 1
  br label %while_body4

next2:                                            ; preds = %next10
  ret void

while_body4:                                      ; preds = %lambda, %next10
  %id111 = phi i32 [ %., %lambda ], [ %id9, %next10 ]
  tail call ptx_device void @llvm.nvvm.barrier0()
  br i1 %1, label %if_then7, label %next10

if_then7:                                         ; preds = %while_body4
  %2 = sext i32 %id111 to i64
  %3 = getelementptr inbounds [0 x i32], [0 x i32]* %_271_389, i64 0, i64 %2
  store i32 1, i32* %3, align 4
  %4 = add nsw i32 %id111, 2
  br label %next10

next10:                                           ; preds = %if_then7, %while_body4
  %id9 = phi i32 [ %4, %if_then7 ], [ %id111, %while_body4 ]
  tail call ptx_device void @llvm.nvvm.barrier0()
  %5 = icmp slt i32 %id9, 57
  br i1 %5, label %while_body4, label %next2
}

; Function Attrs: nounwind readnone
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*) #0

; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: noduplicate nounwind
declare ptx_device void @llvm.nvvm.barrier0() #1

attributes #0 = { nounwind readnone }
attributes #1 = { noduplicate nounwind }

!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}

!0 = !{i64 1, i64 2}
!1 = !{void ([0 x i32]*)* @lambda_crit_268, !"kernel", i64 1}

Btw, we will have the same problem for SPIR in 3.8.
In case we want to support SPIR in the future, we may want to annotate attributes in Impala, e.g.

extern "device" {
    fn "_Z7barrierj attribute(noduplicate)" barrier(i32) -> ();
}

I'm closing this issue.

  • Works for NVVM on master using LLVM 3.8
  • Broken for SPIR, but we will remove SPIR support: #59
  • Will work for amdgpu using LLVM 4.0: #66