Error in FindSetPartitions
ctcyang opened this issue · comments
moderngpu git tag: V1.1
Ubuntu 14.04.5 LTS (GNU/Linux 3.13.0-106-generic x86_64)
gcc-4.8.real (Ubuntu 4.8.5-2ubuntu1~14.04.1) 4.8.5
Cuda compilation tools, release 8.0, V8.0.44
I noticed that FindSetPartitions() generates an error whenever the number of elements is larger than a single block can hold. For example, calling FindSetPartitions() on two sorted arrays A and B of lengths 591 and 584, respectively, I get:
0: 0 -2147483198 591
whereas I expected the result to match that of MergePathPartitions():
0: 0 449 591
Minimum working example:
#include "kernels/sets.cuh"
#include "kernels/intervalmove.cuh"
#include <algorithm>
using namespace mgpu;
// Writes 0 into keys[0 .. total).
//
// Uses a grid-stride loop, so any 1D launch configuration covers the whole
// range; threads whose starting index is already >= total do nothing.
//   keys  - destination array, must hold at least `total` elements
//   total - number of elements to clear
template<typename T>
__global__ void setZero( T *keys, const int total ) {
    // Distance between consecutive elements handled by the same thread.
    const unsigned stride = blockDim.x * gridDim.x;
    int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while( pos < total ) {
        keys[pos] = 0;
        pos += stride;
    }
}
// For every i in [0, total): mark[i] = 1 if keys[i+1] differs from keys[i],
// otherwise 0. On sorted input this flags the last element of each run of
// equal keys (except the run that ends at the final array element, which has
// no right neighbour inside [0, total)).
//
// Grid-stride loop; valid for any 1D launch configuration.
//   keys  - input array; element i+1 is read, so it must hold at least
//           total + 1 elements
//   mark  - output flags, at least `total` elements
//   total - number of positions to evaluate
template<typename T>
__global__ void lookRight( const T *keys, T *mark, const int total ) {
    const unsigned stride = blockDim.x * gridDim.x;
    int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while( pos < total ) {
        const bool differsFromRight = ( keys[pos + 1] != keys[pos] );
        mark[pos] = differsFromRight ? 1 : 0;
        pos += stride;
    }
}
// Scatter step of a stream compaction: for every i in [0, total) whose
// mark[i] is non-zero, stores the index i at out[scan[i]].
//
// Grid-stride loop; valid for any 1D launch configuration.
//   mark  - per-element flags; only flagged positions are written out
//   scan  - destination slot for each element; read only where mark[i] is set
//   out   - compacted list of flagged indices; must be large enough for every
//           scan[i] that is used
//   total - number of positions to examine
template<typename T>
__global__ void streamCompact( const T *mark, const T *scan, T *out, const int total ) {
    const unsigned stride = blockDim.x * gridDim.x;
    for( int pos = threadIdx.x + blockIdx.x * blockDim.x; pos < total; pos += stride ) {
        if( !mark[pos] ) continue;   // unflagged: contributes nothing
        out[scan[pos]] = pos;
    }
}
// Reports (and clears) any pending CUDA launch error.
// Returns true when an error was pending so the caller can bail out.
static bool reportLaunchError( const char* stage ) {
    const cudaError_t err = cudaGetLastError();
    if( err == cudaSuccess ) return false;
    printf("CUDA error after %s: %s\n", stage, cudaGetErrorString(err));
    return true;
}

// Builds two sorted random key arrays A and B, strips duplicate keys from each
// and prints the tile partitions computed by FindSetPartitions next to those
// computed by MergePathPartitions.
//
//   count   - combined number of generated keys; A gets count/2, B the rest.
//             Values below 2 are rejected.
//   numIt   - unused; kept so existing callers keep compiling.
//   context - moderngpu context used for allocation and the library calls.
//
// FindSetPartitions values carry a flag in the top bit (OR-ed in as
// 0x80000000); the index is recovered with `& 0x7fffffff`. Printed raw, a
// flagged entry looks like a large negative number, so a decoded listing is
// printed in addition to the raw one.
template<typename T>
void BenchmarkSetsPairs(int count, int numIt, CudaContext& context) {
    (void)numIt;

    const int aCount = count / 2;        // number of A elements
    const int bCount = count - aCount;   // number of B elements
    if( aCount < 1 ) {
        // The code below works on aCount-1 / bCount-1 elements.
        printf("BenchmarkSetsPairs: count must be at least 2 (got %d)\n", count);
        return;
    }

    MGPU_MEM(T) aKeys = context.SortRandom<T>(aCount, 0, count);
    MGPU_MEM(T) bKeys = context.SortRandom<T>(bCount, 0, count);
    MGPU_MEM(T) aMark = context.Malloc<T>(aCount);
    MGPU_MEM(T) bMark = context.Malloc<T>(bCount);
    MGPU_MEM(T) aScan = context.Malloc<T>(aCount);
    MGPU_MEM(T) bScan = context.Malloc<T>(bCount);

    const int NTHREADS = 128;
    const int NBLOCKS = 480;

    // Clear all flags first: lookRight only writes the first count-1 entries,
    // and streamCompact later reads all `count` of them.
    setZero<<<NBLOCKS,NTHREADS>>>( aMark->get(), aCount );
    setZero<<<NBLOCKS,NTHREADS>>>( bMark->get(), bCount );
    lookRight<<<NBLOCKS,NTHREADS>>>( aKeys->get(), aMark->get(), aCount-1 );
    lookRight<<<NBLOCKS,NTHREADS>>>( bKeys->get(), bMark->get(), bCount-1 );
    if( reportLaunchError("setZero/lookRight") ) return;

    // NOTE(review): the final element of each array is never flagged, so the
    // largest key of A and of B is missing from the "unique" arrays. Left
    // as-is to keep the reported sizes (591 / 584) reproducible - confirm
    // whether that is intended.
    int aCountDupFree = 0;
    int bCountDupFree = 0;
    // Only do scan for first aCount-1 since last one isn't defined
    Scan<MgpuScanTypeExc>( aMark->get(), aCount-1, 0, plus<int>(), (int*)0,
        &aCountDupFree, aScan->get(), context );
    Scan<MgpuScanTypeExc>( bMark->get(), bCount-1, 0, plus<int>(), (int*)0,
        &bCountDupFree, bScan->get(), context );

    MGPU_MEM(T) aInd = context.Malloc<T>(aCountDupFree);
    MGPU_MEM(T) bInd = context.Malloc<T>(bCountDupFree);
    MGPU_MEM(T) aKeysUnique = context.Malloc<T>(aCountDupFree);
    MGPU_MEM(T) bKeysUnique = context.Malloc<T>(bCountDupFree);

    // Collect the indices of the flagged elements ...
    streamCompact<<<NBLOCKS,NTHREADS>>>( aMark->get(), aScan->get(),
        aInd->get(), aCount );
    streamCompact<<<NBLOCKS,NTHREADS>>>( bMark->get(), bScan->get(),
        bInd->get(), bCount );
    if( reportLaunchError("streamCompact") ) return;

    // ... and gather the corresponding keys.
    IntervalGather( aCountDupFree, aInd->get(), counting_iterator<int>(0),
        aCountDupFree, aKeys->get(), aKeysUnique->get(), context );
    IntervalGather( bCountDupFree, bInd->get(), counting_iterator<int>(0),
        bCountDupFree, bKeys->get(), bKeysUnique->get(), context );

    printf("A:\n");
    PrintArray(*aKeysUnique, "%6d", 10);
    printf("B:\n");
    PrintArray(*bKeysUnique, "%6d", 10);
    printf("\nA total:%d\nB total:%d\n", aCountDupFree, bCountDupFree);

    // Benchmark MGPU
    typedef LaunchBoxVT<128, 7> Tuning;
    int2 launch = Tuning::GetLaunchParams(context);
    const int NV = launch.x * launch.y;   // elements per tile

    // BalancedPath search to establish partitions.
    MGPU_MEM(int) partitionsDevice = FindSetPartitions<false>(
        aKeysUnique->get(), aCountDupFree, bKeysUnique->get(), bCountDupFree,
        NV, less<int>(), context);
    printf("FindSetPartitions:\n");
    PrintArray(*partitionsDevice, "%6d", 10 ); // numPart = 4

    // Same values with the top-bit flag stripped; '*' marks entries that had
    // the flag set.
    std::vector<int> partitionsHost;
    partitionsDevice->ToHost(partitionsHost);
    printf("FindSetPartitions (decoded, '*' = 0x80000000 flag set):\n");
    for( size_t i = 0; i < partitionsHost.size(); ++i ) {
        const unsigned raw = (unsigned)partitionsHost[i];
        printf("%6d%c", (int)(raw & 0x7fffffffu),
            (raw & 0x80000000u) ? '*' : ' ');
    }
    printf("\n");

    MGPU_MEM(int) mergepartitionsDevice = MergePathPartitions<MgpuBoundsUpper>(
        aKeysUnique->get(), aCountDupFree, bKeysUnique->get(), bCountDupFree,
        NV, 0, less<int>(), context);
    printf("MergePathPartitions:\n");
    PrintArray(*mergepartitionsDevice, "%6d", 10 ); // numPart = 4
}
// Test configurations; each row is { count, numIt } as passed to
// BenchmarkSetsPairs.
const int Tests[][2] = {
    { 1500, 1 }
};
const int NumTests = sizeof(Tests) / sizeof(*Tests);

// Creates the CUDA context and runs BenchmarkSetsPairs<int> for every row of
// Tests.
int main(int argc, char** argv) {
    ContextPtr context = CreateCudaDevice(argc, argv, true);

    typedef int T1;

    // Iterate over the whole table rather than a hard-coded count, so rows
    // added to Tests are actually executed.
    for(int test = 0; test < NumTests; ++test)
        BenchmarkSetsPairs<T1>(Tests[test][0], Tests[test][1], *context);

    return 0;
}
Never mind, I figured out my mistake. This was a clever encoding of the block partitions, done by bitwise OR-ing with 0x80000000. Later on, the value can be recovered by bitwise AND-ing with 0x7fffffff.