NVIDIA / cuCollections

Is this a duplicate?

I confirmed there appear to be no duplicate issues for this bug (https://github.com/NVIDIA/cuCollections/issues)

Type of Bug

Something else

Describe the bug

Hash-based joins in cudf that use static_multimap's pair_retrieve are not racecheck clean, e.g.:

========= Warning: Race reported between Write access at 0x6db0 in /home/wence/Documents/src/rapids/cudf/cpp/build/cuda-11.8.0/branch-23.08/release/_deps/cuco-src/include/cuco/detail/static_multimap/kernels.cuh:500:void cuco::detail::pair_retrieve<(unsigned int)128, (unsigned int)32, (unsigned int)2, (unsigned int)96, (bool)1, thrust::transform_iterator<cudf::detail::make_pair_function<cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, thrust::zip_iterator<thrust::tuple<thrust::discard_iterator<thrust::use_default>, int *, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::zip_iterator<thrust::tuple<thrust::discard_iterator<thrust::use_default>, int *, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, cuda::__4::atomic<unsigned long, (cuda::std::__4::__detail::thread_scope)1>, cuco::static_multimap<unsigned int, int, (cuda::std::__4::__detail::thread_scope)1, rmm::mr::stream_allocator_adaptor<default_allocator<char>>, cuco::double_hashing<(unsigned int)2, cudf::hashing::detail::MurmurHash3_x86_32<unsigned int>, cudf::hashing::detail::MurmurHash3_x86_32<unsigned int>>>::device_view, cudf::detail::pair_equality<cudf::experimental::row::equality::strong_index_comparator_adapter<cudf::experimental::row::equality::device_row_comparator<(bool)0, cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>>>(T6, long, T7, T8, T9 *, T10, T11)
=========     and Read access at 0x25410 in /home/wence/Documents/src/rapids/cudf/cpp/build/cuda-11.8.0/branch-23.08/release/_deps/cuco-src/include/cuco/detail/static_multimap/kernels.cuh:536:void cuco::detail::pair_retrieve<(unsigned int)128, (unsigned int)32, (unsigned int)2, (unsigned int)96, (bool)1, thrust::transform_iterator<cudf::detail::make_pair_function<cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, thrust::zip_iterator<thrust::tuple<thrust::discard_iterator<thrust::use_default>, int *, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::zip_iterator<thrust::tuple<thrust::discard_iterator<thrust::use_default>, int *, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, cuda::__4::atomic<unsigned long, (cuda::std::__4::__detail::thread_scope)1>, cuco::static_multimap<unsigned int, int, (cuda::std::__4::__detail::thread_scope)1, rmm::mr::stream_allocator_adaptor<default_allocator<char>>, cuco::double_hashing<(unsigned int)2, cudf::hashing::detail::MurmurHash3_x86_32<unsigned int>, cudf::hashing::detail::MurmurHash3_x86_32<unsigned int>>>::device_view, cudf::detail::pair_equality<cudf::experimental::row::equality::strong_index_comparator_adapter<cudf::experimental::row::equality::device_row_comparator<(bool)0, cudf::nullate::DYNAMIC, cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>>>(T6, long, T7, T8, T9 *, T10, T11) [4 hazards]
=========

How to Reproduce

import cudf
df = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
inter = df.merge(df, on=["a", "b"], how="left")

compute-sanitizer --tool=racecheck python bug.py

I can try to build a cuco-only reproducer, but I am not very familiar with the API...

Expected behavior

No races.
Here's the bit of code that's being complained about:

cuCollections/include/cuco/detail/static_multimap/kernels.cuh

Lines 498 to 538 in a2833db

    
           __shared__ uint32_t flushing_cg_counter[num_flushing_cgs]; 
        
           if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } 
        
           while (flushing_cg.any(idx < n)) { 
        
             bool active_flag        = idx < n; 
        
             auto active_flushing_cg = cg::binary_partition<flushing_cg_size>(flushing_cg, active_flag); 
        
             if (active_flag) { 
        
               pair_type pair = *(first + idx); 
        
               if constexpr (is_outer) { 
        
                 view.pair_retrieve_outer<buffer_size>(active_flushing_cg, 
        
                                                       probing_cg, 
        
                                                       pair, 
        
                                                       &flushing_cg_counter[flushing_cg_id], 
        
                                                       probe_output_buffer[flushing_cg_id], 
        
                                                       contained_output_buffer[flushing_cg_id], 
        
                                                       num_matches, 
        
                                                       probe_output_begin, 
        
                                                       contained_output_begin, 
        
                                                       pair_equal); 
        
               } else { 
        
                 view.pair_retrieve<buffer_size>(active_flushing_cg, 
        
                                                 probing_cg, 
        
                                                 pair, 
        
                                                 &flushing_cg_counter[flushing_cg_id], 
        
                                                 probe_output_buffer[flushing_cg_id], 
        
                                                 contained_output_buffer[flushing_cg_id], 
        
                                                 num_matches, 
        
                                                 probe_output_begin, 
        
                                                 contained_output_begin, 
        
                                                 pair_equal); 
        
               } 
        
             } 
        
             idx += loop_stride; 
        
           } 
        
           // Final flush of output buffer 
        
           if (flushing_cg_counter[flushing_cg_id] > 0) { 
        
             view.flush_output_buffer(flushing_cg, 
        
                                      flushing_cg_counter[flushing_cg_id],

The reported race is between the write on line 500 and the read on line 536. view.pair_retrieve syncs the CG [edit: only the smaller active_cg, so I don't think that is enough anyway], but then writes to the __shared__ flushing_cg_counter without syncing afterwards. So we can exit the loop on line 534, potentially with an unsynced write. Equally, if the loop never executes, we will also not sync the CG. So plausibly

diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh
index ca5f898..1ffa170 100644
--- a/include/cuco/detail/static_multimap/kernels.cuh
+++ b/include/cuco/detail/static_multimap/kernels.cuh
@@ -532,6 +532,8 @@ __global__ void pair_retrieve(InputIt first,
     idx += loop_stride;
   }
 
+  flushing_cg.sync();
+
   // Final flush of output buffer
   if (flushing_cg_counter[flushing_cg_id] > 0) {
     view.flush_output_buffer(flushing_cg,

is enough.

Aside: I think there is also a potential race in the pair_retrieve function: if the tail end of the while (running) loop flushes the buffer, cg_counter is written to, but the CG is not synced before the next iteration's read (that's device_view_impl.inl lines 1068 vs. 1096).

Reproduction link

No response

Operating System

No response

nvidia-smi output

No response

NVCC version

No response

	__shared__ uint32_t flushing_cg_counter[num_flushing_cgs];

	if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; }

	while (flushing_cg.any(idx < n)) {
	bool active_flag = idx < n;
	auto active_flushing_cg = cg::binary_partition<flushing_cg_size>(flushing_cg, active_flag);

	if (active_flag) {
	pair_type pair = *(first + idx);
	if constexpr (is_outer) {
	view.pair_retrieve_outer<buffer_size>(active_flushing_cg,
	probing_cg,
	pair,
	&flushing_cg_counter[flushing_cg_id],
	probe_output_buffer[flushing_cg_id],
	contained_output_buffer[flushing_cg_id],
	num_matches,
	probe_output_begin,
	contained_output_begin,
	pair_equal);
	} else {
	view.pair_retrieve<buffer_size>(active_flushing_cg,
	probing_cg,
	pair,
	&flushing_cg_counter[flushing_cg_id],
	probe_output_buffer[flushing_cg_id],
	contained_output_buffer[flushing_cg_id],
	num_matches,
	probe_output_begin,
	contained_output_begin,
	pair_equal);
	}
	}
	idx += loop_stride;
	}

	// Final flush of output buffer
	if (flushing_cg_counter[flushing_cg_id] > 0) {
	view.flush_output_buffer(flushing_cg,
	flushing_cg_counter[flushing_cg_id],

[BUG]: data race in static_multimap pair_retrieve

Is this a duplicate?

Type of Bug

Describe the bug

How to Reproduce

Expected behavior

Reproduction link

Operating System

nvidia-smi output

NVCC version