moderngpu / moderngpu

Patterns and behaviors for GPU computing

Home Page:http://moderngpu.github.io/moderngpu

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Error in FindSetPartitions

ctcyang opened this issue · comments

moderngpu git tag: V1.1
Ubuntu 14.04.5 LTS (GNU/Linux 3.13.0-106-generic x86_64)
gcc-4.8.real (Ubuntu 4.8.5-2ubuntu1~14.04.1) 4.8.5
Cuda compilation tools, release 8.0, V8.0.44


I noticed that FindSetPartitions() produces an unexpected value whenever the number of elements is larger than one block. For example, calling FindSetPartitions() on two sorted arrays A and B of lengths 591 and 584, respectively, I get:

0:       0 -2147483198    591

when the expected result should match that of MergePathPartitions():

0:       0    449    591

Minimum working example:

#include "kernels/sets.cuh"
#include "kernels/intervalmove.cuh"
#include <algorithm>

using namespace mgpu;

// Writes 0 to keys[0 .. total-1].
// Uses a grid-stride loop over a 1D launch: each thread starts at its global
// thread index and advances by the total number of launched threads, so the
// whole range is covered for any <<<blocks, threads>>> configuration.
template<typename T>
__global__ void setZero( T *keys, const int total ) {
    // Kept unsigned so the index arithmetic matches the builtin variables' type.
    const unsigned int stride = blockDim.x*gridDim.x;
    int pos = threadIdx.x+blockIdx.x*blockDim.x;
    while( pos<total ) {
        keys[pos] = 0;
        pos += stride;
    }
}

// For every idx in [0, total): mark[idx] = 1 if keys[idx+1] differs from
// keys[idx], otherwise 0.
// Precondition: keys must hold at least total+1 elements, because element
// idx+1 is read for the last idx. Callers in this file pass (count - 1).
// Grid-stride loop over a 1D launch.
template<typename T>
__global__ void lookRight( const T *keys, T *mark, const int total ) {
	const unsigned int stride = blockDim.x*gridDim.x;
	int pos = threadIdx.x+blockIdx.x*blockDim.x;
	while( pos<total ) {
		// Compare each key with its right-hand neighbour.
		mark[pos] = ( keys[pos+1]!=keys[pos] ) ? 1 : 0;
		pos += stride;
	}
}

// For every idx in [0, total) whose mark[idx] is non-zero, stores idx at
// out[scan[idx]]. scan[idx] is only read for marked positions.
// Presumably scan is the exclusive prefix sum of mark, so marked indices are
// packed densely into out -- that is how the caller in this file builds it.
// Grid-stride loop over a 1D launch.
template<typename T>
__global__ void streamCompact( const T *mark, const T *scan, T *out, const int total ) {
	const unsigned int stride = blockDim.x*gridDim.x;
	int pos = threadIdx.x+blockIdx.x*blockDim.x;
	while( pos<total ) {
		if( mark[pos] ) {
			out[scan[pos]] = pos;
		}
		pos += stride;
	}
}

// Builds two sorted random key arrays, removes duplicate keys from each, and
// prints the partitions computed by FindSetPartitions and MergePathPartitions
// for the de-duplicated inputs so the two can be compared.
//
//   count   - total number of keys generated; A gets count/2, B gets the rest.
//             Keys are drawn via context.SortRandom<T>(n, 0, count).
//   numIt   - accepted for interface compatibility; not used by this function.
//   context - moderngpu context used for allocation and kernel launches.
//
// The Scan calls use int for the identity and the reduction, so this is
// effectively written for T = int (the only instantiation used by main).
template<typename T>
void BenchmarkSetsPairs(int count, int numIt, CudaContext& context) {

	int aCount = count / 2;		 // number of A elements
	int bCount = count - aCount; // number of B elements

	// Sorted random keys. A is generated before B; keep this order so the
	// generated sequences stay the same.
	MGPU_MEM(T) aKeys = context.SortRandom<T>(aCount, 0, count);
	MGPU_MEM(T) bKeys = context.SortRandom<T>(bCount, 0, count);

	MGPU_MEM(T) aMark = context.Malloc<T>(aCount);
	MGPU_MEM(T) bMark = context.Malloc<T>(bCount);
	MGPU_MEM(T) aScan = context.Malloc<T>(aCount);
	MGPU_MEM(T) bScan = context.Malloc<T>(bCount);

	const int NTHREADS = 128;
	const int NBLOCKS = 480;

	// Zero all marks first: lookRight below only writes the first count-1
	// entries, and streamCompact later reads all count entries.
	setZero<<<NBLOCKS,NTHREADS>>>( aMark->get(), aCount );
	setZero<<<NBLOCKS,NTHREADS>>>( bMark->get(), bCount );
	// Mark every position whose right-hand neighbour holds a different key.
	lookRight<<<NBLOCKS,NTHREADS>>>( aKeys->get(), aMark->get(), aCount-1 );
	lookRight<<<NBLOCKS,NTHREADS>>>( bKeys->get(), bMark->get(), bCount-1 );

	int aCountDupFree = 0;
	int bCountDupFree = 0;
	// Only do scan for first aCount-1 since last one isn't defined.
	// The scan total (number of marked positions) becomes the size of the
	// de-duplicated array.
	Scan<MgpuScanTypeExc>( aMark->get(), aCount-1, 0, plus<int>(), (int*)0,
		&aCountDupFree, aScan->get(), context );
	Scan<MgpuScanTypeExc>( bMark->get(), bCount-1, 0, plus<int>(), (int*)0,
		&bCountDupFree, bScan->get(), context );

	// NOTE(review): index count-1 is never marked, so the last distinct key of
	// each input does not make it into the compacted arrays. Left unchanged so
	// the example keeps reproducing the sizes reported in the issue.
	MGPU_MEM(T) aInd  = context.Malloc<T>(aCountDupFree);
	MGPU_MEM(T) bInd  = context.Malloc<T>(bCountDupFree);
	MGPU_MEM(T) aKeysUnique  = context.Malloc<T>(aCountDupFree);
	MGPU_MEM(T) bKeysUnique  = context.Malloc<T>(bCountDupFree);

	// Pack the indices of the marked positions, then gather their keys.
	streamCompact<<<NBLOCKS,NTHREADS>>>( aMark->get(), aScan->get(),
		aInd->get(), aCount );
	streamCompact<<<NBLOCKS,NTHREADS>>>( bMark->get(), bScan->get(),
		bInd->get(), bCount );
	IntervalGather( aCountDupFree, aInd->get(), counting_iterator<int>(0),
		aCountDupFree, aKeys->get(), aKeysUnique->get(), context );
	IntervalGather( bCountDupFree, bInd->get(), counting_iterator<int>(0),
		bCountDupFree, bKeys->get(), bKeysUnique->get(), context );

	printf("A:\n");
	PrintArray(*aKeysUnique, "%6d", 10);
	printf("B:\n");
	PrintArray(*bKeysUnique, "%6d", 10);
	printf("\nA total:%d\nB total:%d\n", aCountDupFree, bCountDupFree);

	// Partition granularity: NV is the product of the two launch parameters
	// returned for this tuning.
	const int NT = 128;
	const int VT = 7;
	typedef LaunchBoxVT<NT, VT> Tuning;
	int2 launch = Tuning::GetLaunchParams(context);
	const int NV = launch.x * launch.y;

	// BalancedPath search to establish partitions.
	// NOTE: per the resolution of the issue this example was posted for, the
	// entries returned here can have 0x80000000 OR'ed in as part of their
	// encoding; AND with 0x7fffffff to recover the value. They are printed raw.
	MGPU_MEM(int) partitionsDevice = FindSetPartitions<false>(
		aKeysUnique->get(), aCountDupFree, bKeysUnique->get(), bCountDupFree,
		NV, less<int>(), context);

	printf("FindSetPartitions:\n");
	PrintArray(*partitionsDevice, "%6d", 10 );

	// Merge-path partitions over the same inputs, printed for comparison.
	MGPU_MEM(int) mergepartitionsDevice = MergePathPartitions<MgpuBoundsUpper>(
		aKeysUnique->get(), aCountDupFree, bKeysUnique->get(), bCountDupFree,
		NV, 0, less<int>(), context);

	printf("MergePathPartitions:\n");
	PrintArray(*mergepartitionsDevice, "%6d", 10 );

}

// Test configurations. Each row is { count, numIt }, passed in that order to
// BenchmarkSetsPairs by main.
const int Tests[][2] = {
	{ 1500, 1 }
};
// Number of rows in Tests.
const int NumTests = sizeof(Tests) / sizeof(Tests[0]);

// Entry point: creates a CUDA device context from the command-line arguments
// and runs BenchmarkSetsPairs<int> once for every row of the Tests table.
int main(int argc, char** argv) {
	// NOTE(review): the meaning of the third argument (true) is defined by
	// moderngpu's CreateCudaDevice -- confirm against the library headers.
	ContextPtr context = CreateCudaDevice(argc, argv, true);

	typedef int T1;

	// Iterate over the whole table instead of a hard-coded count so that rows
	// added to Tests are picked up automatically.
	for(int test = 0; test < NumTests; ++test)
		BenchmarkSetsPairs<T1>(Tests[test][0], Tests[test][1], *context);

	return 0;
}

Never mind, I figured out my mistake. The block partitions are deliberately encoded by bitwise OR-ing them with 0x80000000; the value can later be recovered by bitwise AND-ing with 0x7fffffff. For example, -2147483198 & 0x7fffffff = 450.