cjordan / hip-sys

Rust bindings for HIP

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Segfault when launching kernel

dengelt opened this issue · comments

I've been trying out this crate, and I can't get an example kernel to launch. I ported this example to Rust: https://github.com/ROCm/HIP/blob/master/samples/0_Intro/module_api/defaultDriver.cpp

This is what the code looks like:

use hip_runtime_sys::{
    hipCtxCreate, hipCtxDestroy, hipDeviceGet, hipDeviceptr_t, hipFree, hipFunction_t, hipInit,
    hipMalloc, hipMemcpyDtoH, hipMemcpyHtoD, hipModuleGetFunction, hipModuleLaunchKernel,
    hipModuleLoad, hipModule_t,
};

const LEN: usize = 64;
const SIZE: usize = LEN << 2;

fn main() {
    let mut a: Vec<f32> = vec![0.0; LEN];
    let mut b: Vec<f32> = vec![0.0; LEN];

    for i in 0..LEN {
        a[i] = (i as f32) * 1.0;
        b[i] = 0.0;
    }
    let file_name = std::ffi::CString::new("examples/vcpy_kernel.code").unwrap();
    let kernel_name = std::ffi::CString::new("hello_world").unwrap();

    unsafe {
        hipInit(0);
        let mut device = 0;
        hipDeviceGet(&mut device, 0);

        let mut context = std::ptr::null_mut();
        hipCtxCreate(&mut context, 0, device);

        let mut ad: hipDeviceptr_t = std::ptr::null_mut();
        let mut bd: hipDeviceptr_t = std::ptr::null_mut();
        hipMalloc(&mut ad, SIZE);
        hipMalloc(&mut bd, SIZE);

        hipMemcpyHtoD(ad, a.as_mut_ptr() as *mut std::ffi::c_void, SIZE);
        hipMemcpyHtoD(bd, b.as_mut_ptr() as *mut std::ffi::c_void, SIZE);

        let mut module: hipModule_t = std::ptr::null_mut();
        hipModuleLoad(&mut module, file_name.as_ptr());

        let mut function: hipFunction_t = std::ptr::null_mut();
        hipModuleGetFunction(&mut function, module, kernel_name.as_ptr());

        let mut args: [*mut std::ffi::c_void; 2] =
            [ad as *mut std::ffi::c_void, bd as *mut std::ffi::c_void];

        hipModuleLaunchKernel(
            function,
            1,
            1,
            1,
            LEN as u32,
            1,
            1,
            0,
            std::ptr::null_mut(),
            args.as_mut_ptr(),
            std::ptr::null_mut(),
        );

        hipMemcpyDtoH(b.as_mut_ptr() as *mut std::ffi::c_void, bd, SIZE);

        let mismatch_count = a.iter().zip(b.iter()).filter(|(a, b)| a != b).count();

        if mismatch_count == 0 {
            println!("PASSED!");
        } else {
            println!("FAILED!");
        }

        hipFree(ad);
        hipFree(bd);
        hipCtxDestroy(context);
    }
}

The linked C++ example runs fine for me, so I think my HIP/ROCm installation is not the issue. But the Rust version segfaults on calling hipModuleLaunchKernel, 4 calls into libamdhip64.so.5. I can't see where exactly, since I don't have debug symbols for libamdhip64.so.5.

The kernel is identical to the example (literally copied the compiled kernel), so that shouldn't be the issue. I thought maybe the problem is that the C++ example uses hipcc also to compile the host code, and hipcc sets some special clang flags, but compiling the host code of the C++ example with clang++ works just as well.

Does the Rust example, or any other Rust example, work for you?

I also recreated the same example in Python using pyhip, and that works too. So I do think this is some issue in the hip-sys bindings.

import ctypes
import numpy as np
from pyhip import hip
  
  
with open("/path/to/vcpy_kernel.code", "rb") as f:
    code = f.read()
  
module = hip.hipModuleLoadData(code)
kernel = hip.hipModuleGetFunction(module, 'hello_world')
  
LEN = 64
SIZE = LEN * 4
  
A = np.arange(LEN, dtype=np.float32)
B = np.zeros(LEN, dtype=np.float32)
  
ptr_A = hip.hipMalloc(SIZE)
ptr_B = hip.hipMalloc(SIZE)
  
hip.hipMemcpy_htod(ptr_A, A.ctypes.data, SIZE)
hip.hipMemcpy_htod(ptr_B, B.ctypes.data, SIZE)
  
# Define the structure to pack two pointers
class PackageStruct(ctypes.Structure):
    _fields_ = [("a", ctypes.c_void_p), ("b", ctypes.c_void_p)]
  
# Create an instance of the structure
struct = PackageStruct(ptr_A, ptr_B)
  
hip.hipModuleLaunchKernel(kernel, 1, 1, 1, LEN, 1, 1, 0, 0, struct)
  
hip.hipMemcpy_dtoh(B.ctypes.data, ptr_B, SIZE)
  
mismatch_count = np.sum(A != B)
  
if mismatch_count == 0:
    print("PASSED!")
else:
    print("FAILED!")
  
hip.hipFree(ptr_A)
hip.hipFree(ptr_B)

Modifying the example slightly so that the arguments to the kernel are passed in the more complicated "extra" parameter works for some reason:

use hip_runtime_sys::{
    hipCtxCreate, hipCtxDestroy, hipDeviceGet, hipDeviceptr_t, hipFree, hipFunction_t, hipInit,
    hipMalloc, hipMemcpyDtoH, hipMemcpyHtoD, hipModuleGetFunction, hipModuleLaunchKernel,
    hipModuleLoad, hipModule_t,
};

const LEN: usize = 64;
const SIZE: usize = LEN << 2;

fn main() {
    let mut a: Vec<f32> = vec![0.0; LEN];
    let mut b: Vec<f32> = vec![0.0; LEN];

    for i in 0..LEN {
        a[i] = (i as f32) * 1.0;
        b[i] = 0.0;
    }
    let file_name = std::ffi::CString::new("examples/vcpy_kernel.code").unwrap();
    let kernel_name = std::ffi::CString::new("hello_world").unwrap();

    unsafe {
        hipInit(0);
        let mut device = 0;
        hipDeviceGet(&mut device, 0);

        let mut context = std::ptr::null_mut();
        hipCtxCreate(&mut context, 0, device);

        let mut ad: hipDeviceptr_t = std::ptr::null_mut();
        let mut bd: hipDeviceptr_t = std::ptr::null_mut();
        hipMalloc(&mut ad, SIZE);
        hipMalloc(&mut bd, SIZE);

        hipMemcpyHtoD(ad, a.as_mut_ptr() as *mut std::ffi::c_void, SIZE);
        hipMemcpyHtoD(bd, b.as_mut_ptr() as *mut std::ffi::c_void, SIZE);

        let mut module: hipModule_t = std::ptr::null_mut();
        hipModuleLoad(&mut module, file_name.as_ptr());

        let mut function: hipFunction_t = std::ptr::null_mut();
        hipModuleGetFunction(&mut function, module, kernel_name.as_ptr());

        let mut args = Args {
            _ad: ad as *mut std::ffi::c_void,
            _bd: bd as *mut std::ffi::c_void,
        };

        let mut size = std::mem::size_of::<Args>();

        let mut config = [
            0x1 as *mut std::ffi::c_void,
            std::ptr::addr_of_mut!(args) as *mut std::ffi::c_void,
            0x2 as *mut std::ffi::c_void,
            std::ptr::addr_of_mut!(size) as *mut std::ffi::c_void,
            0x3 as *mut std::ffi::c_void,
        ];
        hipModuleLaunchKernel(
            function,
            1,
            1,
            1,
            LEN as u32,
            1,
            1,
            0,
            std::ptr::null_mut(),
            std::ptr::null_mut(),
            config.as_mut_ptr(),
        );

        hipMemcpyDtoH(b.as_mut_ptr() as *mut std::ffi::c_void, bd, SIZE);

        let mismatch_count = a.iter().zip(b.iter()).filter(|(a, b)| a != b).count();

        if mismatch_count == 0 {
            println!("PASSED!");
        } else {
            println!("FAILED!");
        }

        hipFree(ad);
        hipFree(bd);
        hipCtxDestroy(context);
    }
}
#[repr(C)]
struct Args {
    _ad: *mut std::ffi::c_void,
    _bd: *mut std::ffi::c_void,
}

Still no idea why the original code segfaults.

Hi @dengelt - sorry for this very delayed reply! I don't think GitHub was watching the repo for me, or something.

I'd like to help with this issue, but I'll need some time to think about it. I hope to get back to you in a weekend or two.

Hi again. I'm finally getting around to this, but unfortunately I haven't made progress understanding why your original example doesn't work. But, this code is used in production; see this for an example. (It's a bit dirty, but it does work).

I'll keep chipping away at this as I can.

Apologies again, but I don't know how to proceed with this. It could be an issue for rocm, but that doesn't seem likely, and I can't see any functional difference between the initial Rust example and the C++ example.

I did spot some curiosities, though. On my machine (5700XT GPU, rocm 6.0.2, up-to-date Arch linux), the Rust code segfaults in /usr/src/debug/hip-runtime-amd/clr-rocm-6.0.2/rocclr/platform/kernel.cpp:102, and I poked around in there with a debugger (rocgdb).

C++ modification:

    printf("%p %p %p\n", args, Ad, Bd);

C++ output:

0x7fffffffde50 0x7fffea200000 0x7fffea201000

Thread 1 "a.out" hit Breakpoint 1, amd::KernelParameters::set (this=0x55555597fd40, index=0, size=8, value=0x7fffffffde28, svmBound=true) at /usr/src/debug/hip-runtime-amd/clr-rocm-6.0.2/rocclr/platform/kernel.cpp:100

Rust modification:

        let mut args: [*mut std::ffi::c_void; 2] =
            [ad as *mut std::ffi::c_void, bd as *mut std::ffi::c_void];
        dbg!(&args);
        dbg!(args.as_mut_ptr());

Rust output:

[src/main.rs:75:9] &args = [
    0x00007fffea200000,
    0x00007fffea201000,
]
[src/main.rs:76:9] args.as_mut_ptr() = 0x00007fffffffd788

Thread 1 "issue-3" received signal SIGSEGV, Segmentation fault.
0x00007ffff6b0da6c in amd::KernelParameters::set (this=0x555555cf74b0, index=0, size=8, value=0x7fffea200000, svmBound=true)
    at /usr/src/debug/hip-runtime-amd/clr-rocm-6.0.2/rocclr/platform/kernel.cpp:102
102           LP64_SWITCH(uint32_value, uint64_value) = *(LP64_SWITCH(uint32_t*, uint64_t*))value;
(gdb)

Observations:

  • The device buffers are pretty consistently allocated to these addresses during my tinkering
  • In the C++ output, it looks like value passed to amd::KernelParameters::set is more similar to the pointer to the args than the device buffers (i.e. Ad or ad). But on the Rust side, it's almost like args gets de-referenced and the device buffers are used as value.
  • When running the debugger on the Rust code, attempting to use the witnessed C++ offsets for value on args makes the code progress, but a segfault still happens down the line.