ModernGPUv1
r-barnes opened this issue · comments
Richard Barnes commented
We'd like to compile some code using ModernGPUv1 with CUDA 9, but CUDA 9 raises a warning about using __shfl_up():
moderngpu/include/device/../device/intrinsics.cuh(113): warning: function "__shfl_up(float, unsigned int, int)"
moderngpu/include/device/../device/intrinsics.cuh(123): warning: function "__shfl_up(int, unsigned int, int)"
moderngpu/include/device/../device/intrinsics.cuh(124): warning: function "__shfl_up(int, unsigned int, int)"
/usr/include/sm_30_intrinsics.hpp(175): here was declared deprecated ("__shfl_up() is deprecated in favor of __shfl_up_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning).")
It suggests using __shfl_up_sync() instead.
Nvidia warns not to upgrade blindly:
Don’t just use FULL_MASK (i.e. 0xffffffff for 32 threads) as the mask value. If not all threads in the warp can reach the primitive according to the program logic, then using FULL_MASK may cause the program to hang.
Do you know if the 0xffffffff (FULL_MASK) mask is appropriate for ModernGPUv1? The ModernGPUv1 code raising the warnings is copied below for easy reference:
#pragma push_macro("__shfl_up")
#undef __shfl_up
// Device-side wrapper around the legacy __shfl_up intrinsic for float values.
//   var   - value passed to the shuffle.
//   delta - offset argument forwarded to __shfl_up.
//   width - width argument forwarded to __shfl_up; defaults to 32.
// When __CUDA_ARCH__ >= 300 the result of __shfl_up(var, delta, width) is
// returned; otherwise var is returned unchanged.
// NOTE(review): this is the mask-less call that CUDA 9 reports as deprecated in
// favor of __shfl_up_sync(). The correct participant mask depends on how
// callers reach this function - confirm before migrating.
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
var = __shfl_up(var, delta, width);
#endif
return var;
}
// Device-side wrapper around the legacy __shfl_up intrinsic for double values.
//   var   - value passed to the shuffle.
//   delta - offset argument forwarded to __shfl_up.
//   width - width argument forwarded to __shfl_up; defaults to 32.
// When __CUDA_ARCH__ >= 300 the double is converted to an int2 with
// mgpu::double_as_int2, each component is shuffled with its own __shfl_up
// call, and the pair is converted back with mgpu::int2_as_double. Otherwise
// var is returned unchanged.
// NOTE(review): presumably split into two ints because the shuffle moves
// 32-bit values - confirm against the intrinsic's documentation.
// NOTE(review): both calls are the mask-less form that CUDA 9 reports as
// deprecated in favor of __shfl_up_sync(); the correct participant mask
// depends on the callers - confirm before migrating.
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
// Shuffle the two int components independently with the same delta and width.
p.x = __shfl_up(p.x, delta, width);
p.y = __shfl_up(p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
return var;
}
#pragma pop_macro("__shfl_up")
Muhammad Osama commented