[BUG] CAGRA computation results will go wrong
FakeEnd opened this issue · comments
Describe the bug
When I want to compute the distance in a batch like [batch_size, samples, feature], I convert it to [batch_size * samples, feature] and then add a seperate constant to each batch_size. But computation results of CAGRA is still out of its boundary, running to other index.
Steps/Code to reproduce bug
CAGRA computation process
import torch
import numpy as np
from pylibraft.common import DeviceResources
from pylibraft.neighbors import cagra
import rmm
pool = rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(),
initial_pool_size=2**30,
maximum_pool_size=2**32
)
rmm.mr.set_current_device_resource(pool)
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
batch_size = 1000
n_samples = 10
n_features = 64
# n_queries = 1000
k = 3
dataset = torch.randn(batch_size * n_samples, n_features, dtype=torch.float32).cuda()
handle = DeviceResources()
build_params = cagra.IndexParams(metric="sqeuclidean")
seperate_constant = torch.arange(batch_size).view(-1, 1).repeat(1, n_samples).view(-1, 1).cuda() * 1000
# seperate_constant [0, 0, 0, 1000, 1000, 1000, 2000, 2000, 2000, ...]
dataset += seperate_constant
index = cagra.build(build_params, dataset, handle=handle)
distances, neighbors = cagra.search(cagra.SearchParams(),
index, dataset,
k, handle=handle)
handle.sync()
neighbors = neighbors.copy_to_host().astype(np.int64)
neighbors = torch.from_numpy(neighbors).cuda()
To find where CAGRA computation is wrong
neighbors_real = neighbors.reshape(batch_size, n_samples, -1) - torch.arange(batch_size).view(-1, 1, 1).repeat(1, n_samples, k).cuda() * n_samples
neighbors_real_larger_n_samples = torch.nonzero(neighbors_real > n_samples).squeeze()
neighbors_real_smaller_0 = torch.nonzero(neighbors_real < 0).squeeze()
print(len(neighbors_real_larger_n_samples))
print(len(neighbors_real_smaller_0))
print(neighbors_real_larger_n_samples[0])
5726
6152
tensor([119, 3, 1], device='cuda:0')
Verif CAGRA computation wrong place
sample_index = neighbors_real_larger_n_samples[0][0]
# def a distance function using sqeuclidean
def sqeuclidean(x, y):
return torch.sum((x - y) ** 2, dim=-1)
calculate the distance between itself and its neighbors
sqeuclidean(dataset[sample_index*n_samples], dataset[sample_index*n_samples])
tensor(0., device='cuda:0')
sqeuclidean(dataset[sample_index*n_samples], dataset[sample_index*n_samples + 1])
tensor(109.7028, device='cuda:0')
calculate the distance between itself and CAGRA computation results
neighbors_index = neighbors[sample_index*n_samples]
print(neighbors_index)
tensor([1183, 1200, 1189], device='cuda:0')
sqeuclidean(dataset[sample_index*n_samples], dataset[neighbors_index[0]])
tensor(63966640., device='cuda:0')
sqeuclidean(dataset[sample_index*n_samples], dataset[neighbors_index[1]])
tensor(63977952., device='cuda:0')
I can not upload jupyter notebook file, so here is just a markdown file:
test_CAGRA_accuracy_simplified.md
Expected behavior
I expected CAGRA neighbors computaion results is right, which we can exam by using len(neighbors_real_larger_n_samples) and len(neighbors_real_smaller_0) to check it whether are all zero.
Environment details (please complete the following information):
- Environment location: A100
- Method of RAFT install: conda/pip
Additional context
I think maybe it has some connection with this issue #2066 .
I found this error has something connection with parameters and separate constant value.
If degree(including intermediate_graph_degree and graph_degree) sets to higher, it will be more correct.
If seperate_constant = torch.arange(batch_size).view(-1, 1).repeat(1, n_samples).view(-1, 1).cuda() * 1000
change 1000 to 100 or 10, the results will be more correct too. Maybe it has some connection with the largest number in the dataset?
I give a more easier testing code here:
If print output are all zero, the results will be all in their boundary.
import torch
import numpy as np
import cupy as cp
from pylibraft.common import DeviceResources
from pylibraft.neighbors import cagra
import rmm
pool = rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(),
initial_pool_size=2**30,
maximum_pool_size=2**32
)
rmm.mr.set_current_device_resource(pool)
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
batch_size = 1000
n_samples = 10
n_features = 64
# n_queries = 1000
k = 3
dataset = torch.randn(batch_size * n_samples, n_features, dtype=torch.float32).cuda()
handle = DeviceResources()
build_params = cagra.IndexParams(metric="sqeuclidean", build_algo='nn_descent', intermediate_graph_degree=32, graph_degree=32,)
seperate_constant = torch.arange(batch_size).view(-1, 1).repeat(1, n_samples).view(-1, 1).cuda() * 1000
# seperate_constant looks like [0, 0, 0,..., 1000, 1000, 1000,..., 2000, 2000, 2000, ...]
dataset += seperate_constant
index = cagra.build(build_params, dataset, handle=handle)
distances, neighbors = cagra.search(cagra.SearchParams(),
index, dataset,
k, handle=handle)
handle.sync()
neighbors = torch.as_tensor(cp.asarray(neighbors).astype(cp.int64), device='cuda')
for i in range(batch_size):
print(i, sum( (neighbors[i*n_samples:(i+1)*n_samples]<i*n_samples) | (neighbors[i*n_samples:(i+1)*n_samples]>=(i+1)*n_samples) ).sum())
I found a more general case that CAGRA computation results will go wrong.
Here, I use two different ways to calcualte the neighbors: 1) first way is to calculate all the query together, 2) second way is to compute query batch by batch. The results of two ways are different, which can prove there have something wrong in the CAGRA.
import torch
import numpy as np
import cupy as cp
from pylibraft.common import DeviceResources
from pylibraft.neighbors import cagra
import rmm
pool = rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(),
initial_pool_size=2**30,
maximum_pool_size=2**32
)
rmm.mr.set_current_device_resource(pool)
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
batch_size = 10
n_samples = 1000
n_features = 64
# n_queries = 1000
k = 3
key = torch.randn(batch_size * n_samples, n_features, dtype=torch.float32).cuda()
query = torch.randn(batch_size * n_samples, n_features, dtype=torch.float32).cuda()
handle = DeviceResources()
build_params = cagra.IndexParams(metric="sqeuclidean", build_algo='nn_descent', intermediate_graph_degree=128, graph_degree=128,)
index = cagra.build(build_params, key, handle=handle)
distances, neighbors = cagra.search(cagra.SearchParams(),
index, query,
k, handle=handle)
handle.sync()
neighbors = torch.as_tensor(cp.asarray(neighbors).astype(cp.int64), device='cuda')
distances = torch.as_tensor(distances, device='cuda')
# check if the neighbors are correct
distances_check = torch.zeros(batch_size * n_samples, k, dtype=torch.float32).cuda()
neighbors_check = torch.zeros(batch_size * n_samples, k, dtype=torch.int64).cuda()
for i in range(batch_size):
build_params = cagra.IndexParams(metric="sqeuclidean", build_algo='nn_descent', intermediate_graph_degree=32, graph_degree=32,)
index = cagra.build(build_params, key, handle=handle)
distances_index, neighbors_index = cagra.search(cagra.SearchParams(),
index, query[i*n_samples:(i+1)*n_samples],
k, handle=handle)
handle.sync()
distances_check[i*n_samples:(i+1)*n_samples] = torch.as_tensor(distances_index, device='cuda')
neighbors_check[i*n_samples:(i+1)*n_samples] = torch.as_tensor(cp.asarray(neighbors_index).astype(cp.int64), device='cuda')
print(i, (distances[i*n_samples:(i+1)*n_samples] - distances_check[i*n_samples:(i+1)*n_samples]).sum())
print(i, (neighbors[i*n_samples:(i+1)*n_samples] - neighbors_check[i*n_samples:(i+1)*n_samples]).sum())