coreylowman / cudarc

Safe rust wrapper around CUDA toolkit

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Best way to transfer structs with pointer fields between host and device?

Boscop opened this issue · comments

commented

Thanks for this crate, it's very useful!

I have a question:
What's the best way to transfer structs with pointer fields between host and device?

I have this situation:
In CUDA code, I use this struct:

// Net: a handful of scalar fields plus four pointers to separately allocated
// arrays. net_to_device / net_to_host copy each array with a fixed element
// count (BAGS_SIZE, NODE_SIZE, GIDX_SIZE, GMOV_SIZE), so every pointer is
// expected to reference a buffer of at least that many elements.
// The pointers are host addresses in a host-side Net and device addresses in
// the copy that net_to_device uploads.
typedef struct {
  Ptr   root;
  u32   blen;
  Wire* bags; // array of BAGS_SIZE Wire
  Node* node; // array of NODE_SIZE Node
  u32*  gidx; // array of GIDX_SIZE u32
  Wire* gmov; // array of GMOV_SIZE Wire
  u32   pbks;
  u32   done;
  u32   rwts;
} Net;

which references these types:

// Shorthand for unsigned int, used for all scalar fields.
typedef unsigned int u32;

// A Ptr is stored as a plain u32 value.
typedef u32 Ptr;

// Node: two Ptr ports; the struct is aligned to 8 bytes.
typedef struct alignas(8) {
  Ptr ports[2];
} Node;

// Wire: a pair of Ptrs (lft, rgt); the struct is aligned to 8 bytes.
typedef struct alignas(8) {
  Ptr lft;
  Ptr rgt;
} Wire;

I need to transfer instances of Net between host and device.
What's the best way (using this crate), considering that Net has pointer (array) fields?

I'm basically trying to translate these functions from CUDA C++ to Rust/cudarc:

// Deep-copies a host Net to the GPU.
//
// Allocates one device buffer per array field (bags, gidx, gmov, node) plus
// one for the Net struct itself, uploads the array contents, and uploads a
// copy of *host_net whose pointer fields have been replaced by the device
// addresses.
//
// Returns the device pointer to the uploaded Net, or NULL if any CUDA
// allocation or copy fails; on failure every buffer allocated so far is
// released again, so nothing leaks.
__host__ Net* net_to_device(Net* host_net) {
  Net*  device_net  = NULL;
  Wire* device_bags = NULL;
  u32*  device_gidx = NULL;
  Wire* device_gmov = NULL;
  Node* device_node = NULL;

  // cudaMalloc wrapper that leaves *out == NULL on failure, so the cleanup
  // path below never hands an unspecified pointer to cudaFree.
  auto alloc = [](void** out, size_t bytes) -> bool {
    if (cudaMalloc(out, bytes) == cudaSuccess) return true;
    *out = NULL;
    return false;
  };

  // Allocate memory on the device for the Net object and its data, then copy
  // the host data to the device memory. && short-circuits, so nothing after
  // the first failing call is attempted.
  bool ok =
       alloc((void**)&device_net,  sizeof(Net))
    && alloc((void**)&device_bags, BAGS_SIZE * sizeof(Wire))
    && alloc((void**)&device_gidx, GIDX_SIZE * sizeof(u32))
    && alloc((void**)&device_gmov, GMOV_SIZE * sizeof(Wire))
    && alloc((void**)&device_node, NODE_SIZE * sizeof(Node))
    && cudaMemcpy(device_bags, host_net->bags, BAGS_SIZE * sizeof(Wire), cudaMemcpyHostToDevice) == cudaSuccess
    && cudaMemcpy(device_gidx, host_net->gidx, GIDX_SIZE * sizeof(u32),  cudaMemcpyHostToDevice) == cudaSuccess
    && cudaMemcpy(device_gmov, host_net->gmov, GMOV_SIZE * sizeof(Wire), cudaMemcpyHostToDevice) == cudaSuccess
    && cudaMemcpy(device_node, host_net->node, NODE_SIZE * sizeof(Node), cudaMemcpyHostToDevice) == cudaSuccess;

  if (ok) {
    // Create a temporary host Net object with device pointers
    Net temp_net  = *host_net;
    temp_net.bags = device_bags;
    temp_net.gidx = device_gidx;
    temp_net.gmov = device_gmov;
    temp_net.node = device_node;

    // Copy the temporary host Net object to the device memory
    ok = cudaMemcpy(device_net, &temp_net, sizeof(Net), cudaMemcpyHostToDevice) == cudaSuccess;
  }

  if (!ok) {
    // cudaFree(NULL) is a no-op, so buffers that were never allocated are fine here.
    cudaFree(device_node);
    cudaFree(device_gmov);
    cudaFree(device_gidx);
    cudaFree(device_bags);
    cudaFree(device_net);
    return NULL;
  }

  // Return the device pointer to the created Net object
  return device_net;
}

// Deep-copies a device Net (as produced by net_to_device) back to the host.
//
// The device Net struct is downloaded once; its pointer fields are the device
// addresses of the four arrays, so no further per-pointer downloads are
// needed. Fresh host buffers are malloc'ed for the arrays and filled from
// those device addresses.
//
// Returns a malloc'ed Net whose array fields are malloc'ed as well (the
// caller frees all five), or NULL if any allocation or CUDA copy fails; on
// failure everything allocated here is freed again. Device memory is not
// released by this function.
__host__ Net* net_to_host(Net* device_net) {
  // Snapshot of the device Net; bags/gidx/gmov/node are device addresses here.
  Net device_view;
  if (cudaMemcpy(&device_view, device_net, sizeof(Net), cudaMemcpyDeviceToHost) != cudaSuccess) {
    return NULL;
  }

  // Create a new host Net object; scalar fields come straight from the snapshot.
  Net* host_net = (Net*)malloc(sizeof(Net));
  if (host_net == NULL) {
    return NULL;
  }
  *host_net = device_view;

  // Allocate host memory for data
  host_net->bags = (Wire*)malloc(BAGS_SIZE * sizeof(Wire));
  host_net->gidx = (u32*) malloc(GIDX_SIZE * sizeof(u32));
  host_net->gmov = (Wire*)malloc(GMOV_SIZE * sizeof(Wire));
  host_net->node = (Node*)malloc(NODE_SIZE * sizeof(Node));

  // Copy the device data to the host memory. && short-circuits, so no copy is
  // attempted into a buffer that failed to allocate.
  bool ok =
       host_net->bags != NULL && host_net->gidx != NULL
    && host_net->gmov != NULL && host_net->node != NULL
    && cudaMemcpy(host_net->bags, device_view.bags, BAGS_SIZE * sizeof(Wire), cudaMemcpyDeviceToHost) == cudaSuccess
    && cudaMemcpy(host_net->gidx, device_view.gidx, GIDX_SIZE * sizeof(u32),  cudaMemcpyDeviceToHost) == cudaSuccess
    && cudaMemcpy(host_net->gmov, device_view.gmov, GMOV_SIZE * sizeof(Wire), cudaMemcpyDeviceToHost) == cudaSuccess
    && cudaMemcpy(host_net->node, device_view.node, NODE_SIZE * sizeof(Node), cudaMemcpyDeviceToHost) == cudaSuccess;

  if (!ok) {
    // free(NULL) is a no-op, so partially allocated state is handled uniformly.
    free(host_net->node);
    free(host_net->gmov);
    free(host_net->gidx);
    free(host_net->bags);
    free(host_net);
    return NULL;
  }

  return host_net;
}

I wrote it in Rust like this:

/// Device-side layout of a net; mirrors the CUDA `Net` struct field for field.
///
/// The four array fields hold raw device addresses (`CUdeviceptr`) in place of the
/// `Wire*` / `Node*` / `u32*` pointers of the CUDA struct. This struct does **not** own
/// the memory behind those addresses: copying or dropping a `Net` neither duplicates
/// nor frees the device buffers.
///
/// NOTE(review): the layout matches the CUDA struct only where `CUdeviceptr` has the
/// same size and alignment as a device pointer (64-bit targets) — confirm for yours.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct Net {
	pub root: Ptr,
	pub blen: u32,

	/// Device address of the `Wire` array (`Wire* bags` in CUDA).
	pub bags: CUdeviceptr,
	/// Device address of the `Node` array (`Node* node` in CUDA).
	pub node: CUdeviceptr,
	/// Device address of the `u32` array (`u32* gidx` in CUDA).
	pub gidx: CUdeviceptr,
	/// Device address of the `Wire` array (`Wire* gmov` in CUDA).
	pub gmov: CUdeviceptr,

	pub pbks: u32,
	pub done: u32,
	pub rwts: u32,
}
// SAFETY: `Net` is `#[repr(C)]` and consists only of plain integers and device
// addresses, so its bytes can be copied to and from the device as-is.
unsafe impl DeviceRepr for Net {}
// SAFETY: every field is an integer type, for which the all-zero bit pattern is valid.
unsafe impl ValidAsZeroBits for Net {}

// High-level type for Net
/// Host-side, owning counterpart of `Net`: the same fields, but the four arrays are
/// owned boxed slices instead of raw device addresses.
pub struct HlNet {
	pub root: Ptr,         // copied verbatim to/from `Net::root`
	pub blen: u32,         // copied verbatim to/from `Net::blen`
	pub bags: Box<[Wire]>, // uploaded to its own device buffer; `net_to_host` reads back BAGS_SIZE elements
	pub node: Box<[Node]>, // uploaded to its own device buffer; `net_to_host` reads back NODE_SIZE elements
	pub gidx: Box<[u32]>,  // uploaded to its own device buffer; `net_to_host` reads back GIDX_SIZE elements
	pub gmov: Box<[Wire]>, // uploaded to its own device buffer; `net_to_host` reads back GMOV_SIZE elements
	pub pbks: u32,         // copied verbatim to/from `Net::pbks`
	pub done: u32,         // copied verbatim to/from `Net::done`
	pub rwts: u32,         // copied verbatim to/from `Net::rwts`
}

// This works
pub fn net_to_device(net: &HlNet, dev: &Arc<CudaDevice>) -> Result<CudaSlice<Net>, DriverError> {
	let device_bags = dev.htod_sync_copy(&net.bags)?;
	let device_node = dev.htod_sync_copy(&net.node)?;
	let device_gidx = dev.htod_sync_copy(&net.gidx)?;
	let device_gmov = dev.htod_sync_copy(&net.gmov)?;

	let temp_net = Net {
		root: net.root,
		blen: net.blen,
		bags: *(&device_bags).device_ptr() /* as _ */,
		node: *(&device_node).device_ptr() /* as _ */,
		gidx: *(&device_gidx).device_ptr() /* as _ */,
		gmov: *(&device_gmov).device_ptr() /* as _ */,
		pbks: net.pbks,
		done: net.done,
		rwts: net.rwts,
	};

	let device_net = dev.htod_sync_copy(slice::from_ref(&temp_net))?;
	Ok(device_net)
}

// This doesn't work
pub fn net_to_host(dev: &Arc<CudaDevice>, device_net: CudaSlice<Net>) -> Result<HlNet, DriverError> {
	use cudarc::driver::{result, sys::CUdeviceptr};

    // Modified versions that take a `CUdeviceptr` as src, because there seems to be no way to create a `CudaSlice` from a `CUdeviceptr`
	pub fn dtoh_sync_copy_into<T: DeviceRepr>(
		dev: &Arc<CudaDevice>,
		src: CUdeviceptr,
		src_len: usize,
		dst: &mut [T],
	) -> Result<(), DriverError> {
		assert_eq!(src_len, dst.len());
		dev.bind_to_thread()?;
		println!("memcpy_dtoh_sync: {:#?}", src);
		unsafe { result::memcpy_dtoh_sync(dst, src) }?; // This fails: DriverError(CUDA_ERROR_INVALID_VALUE, "invalid argument")
		println!("synchronize: {:#?}", src);
		dev.synchronize()
	}
	pub fn dtoh_sync_copy<T: DeviceRepr>(
		dev: &Arc<CudaDevice>,
		src: CUdeviceptr,
		src_len: usize,
	) -> Result<Vec<T>, DriverError> {
		let mut dst = Vec::with_capacity(src_len);
		unsafe { dst.set_len(src_len) };
		dtoh_sync_copy_into(dev, src, src_len, &mut dst)?;
		Ok(dst)
	}

	// Can't use `sync_reclaim` because `Net` isn't `Clone`, even though there's only one
	// let mut net_vec: Vec<Net> = dev.sync_reclaim(device_net)?;
	let mut net_vec = dev.dtoh_sync_copy(&device_net)?;
	let net = net_vec.remove(0);

	println!("net: {:#?}", net);

	let bags = dtoh_sync_copy(dev, net.bags /* as CUdeviceptr */, BAGS_SIZE as usize)?;
	let node = dtoh_sync_copy(dev, net.node /* as CUdeviceptr */, NODE_SIZE as usize)?;
	let gidx = dtoh_sync_copy(dev, net.gidx /* as CUdeviceptr */, GIDX_SIZE as usize)?;
	let gmov = dtoh_sync_copy(dev, net.gmov /* as CUdeviceptr */, GMOV_SIZE as usize)?;

	let net = HlNet {
		root: net.root,
		blen: net.blen,
		bags: bags.into_boxed_slice(),
		node: node.into_boxed_slice(),
		gidx: gidx.into_boxed_slice(),
		gmov: gmov.into_boxed_slice(),
		pbks: net.pbks,
		done: net.done,
		rwts: net.rwts,
	};
	Ok(net)
}

Reading back the net doesn't work. The program prints:

net: Net {
    root: 0,
    blen: 0,
    bags: 8663334912,
    node: 8797552640,
    gidx: 10945036288,
    gmov: 11012408320,
    pbks: 0,
    done: 0,
    rwts: 0,
}
memcpy_dtoh_sync: 8663334912
Error: DriverError(CUDA_ERROR_INVALID_VALUE, "invalid argument")

So it's failing at memcpy_dtoh_sync of the first field I read (bags).
Any idea why I'm getting the "invalid argument" error for the src pointer?

And what's the idiomatic/recommended way to read such a structure back from the GPU?
Thanks :)

What's happening is that you're accidentally dropping/freeing all of the device memory.

// This works
// Annotated copy of the original function, showing where the device memory is lost.
pub fn net_to_device(net: &HlNet, dev: &Arc<CudaDevice>) -> Result<CudaSlice<Net>, DriverError> {
	// Each of these CudaSlices owns its device allocation.
	let device_bags = dev.htod_sync_copy(&net.bags)?;
	let device_node = dev.htod_sync_copy(&net.node)?;
	let device_gidx = dev.htod_sync_copy(&net.gidx)?;
	let device_gmov = dev.htod_sync_copy(&net.gmov)?;

	// temp_net only stores the raw addresses of the buffers above; ownership of the
	// allocations stays with the device_* CudaSlices.
	let temp_net = Net {
		root: net.root,
		blen: net.blen,
		bags: *(&device_bags).device_ptr() /* as _ */,
		node: *(&device_node).device_ptr() /* as _ */,
		gidx: *(&device_gidx).device_ptr() /* as _ */,
		gmov: *(&device_gmov).device_ptr() /* as _ */,
		pbks: net.pbks,
		done: net.done,
		rwts: net.rwts,
	};

	// Note: this copies temp_net's bytes (raw addresses included) to the device.
	// Ownership of the inner device arrays is NOT transferred to device_net.
	let device_net = dev.htod_sync_copy(slice::from_ref(&temp_net))?;
	// !!!
	// device_bags, device_node, device_gidx and device_gmov go out of scope when the
	// function returns. Dropping them frees their device memory, so the addresses
	// stored in device_net dangle from here on.
	// !!!
	Ok(device_net)
}

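You can get your code to run by calling std::mem::forget(...) or .leak() on each of the inner CudaSlices before the function returns. However, this is more of a band-aid than a real fix, because the leaked device memory is never freed unless you reclaim it yourself. It may be a good idea to come up with a systematic solution for cases where one CudaSlice should own another.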

commented

Ah yes, thanks!

@Boscop did you figure out a more elegant solution to this CudaSlice ownership problem than leaking?

@Boscop did you figure out a more elegant solution to this CudaSlice ownership problem than leaking?

Yes, keeping a pointer to it alive in a returned struct and deallocating it in its drop impl.