cupy.cuda.driver.CUDADriverError: CUDA_ERROR_INVALID_PTX: a PTX JIT compilation failed
sandeepnmenon opened this issue · comments
I get the following error at the beginning of the first epoch while trying to run the resume script for vKITTI:
Total number of parameters: 53601
Module(
(ecc): GraphNetwork(
(0): RNNGraphConvModule(
(_cell): GRUCellEx(
32, 32
(ini): InstanceNorm1d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(inh): InstanceNorm1d(1, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(ig): Linear(in_features=32, out_features=32, bias=True)
)(ingate layernorm)
(_fnet): Sequential(
(0): Linear(in_features=13, out_features=32, bias=True)
(1): ReLU(inplace=True)
(2): Linear(in_features=32, out_features=128, bias=True)
(3): ReLU(inplace=True)
(4): Linear(in_features=128, out_features=64, bias=True)
(5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(6): ReLU(inplace=True)
(7): Linear(in_features=64, out_features=32, bias=False)
)
)
(1): Linear(in_features=32, out_features=13, bias=True)
)
(ptn): PointNet(
(stn): STNkD(
(convs): Sequential(
(0): Conv1d(9, 32, kernel_size=(1,), stride=(1,))
(1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv1d(32, 64, kernel_size=(1,), stride=(1,))
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
)
(fcs): Sequential(
(0): Linear(in_features=64, out_features=32, bias=True)
(1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=32, out_features=16, bias=True)
(4): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
)
(proj): Linear(in_features=16, out_features=4, bias=True)
)
(convs): Sequential(
(0): Conv1d(9, 64, kernel_size=(1,), stride=(1,))
(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
(6): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
(7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(8): ReLU(inplace=True)
)
(fcs): Sequential(
(0): Linear(in_features=129, out_features=64, bias=True)
(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=64, out_features=32, bias=True)
(4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=32, out_features=32, bias=True)
)
)
)
0%| | 0/15 [00:00<?, ?it/s]
Traceback (most recent call last):
File "./learning/main.py", line 459, in <module>
main()
File "./learning/main.py", line 381, in main
acc_test, oacc_test, avg_iou_test, per_class_iou_test, predictions_test, avg_acc_test, confusion_matrix = eval_final()
File "./learning/main.py", line 287, in eval_final
outputs = model.ecc(embeddings)
File "/home/deepenai/anaconda3/envs/superpoint/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/deepenai/SandeepMenon/superpoint/superpoint_graph/learning/../learning/graphnet.py", line 97, in forward
input = module(input)
File "/home/deepenai/anaconda3/envs/superpoint/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/deepenai/SandeepMenon/superpoint/superpoint_graph/learning/../learning/modules.py", line 175, in forward
input = ecc.GraphConvFunction.apply(hx, weights, nc, nc, idxn, idxe, degs, degs_gpu,
File "/home/deepenai/SandeepMenon/superpoint/superpoint_graph/learning/../learning/ecc/GraphConvModule.py", line 79, in forward
cuda_kernels.conv_aggregate_fw(output.narrow(0, startd, numd), products.view(-1, ctx._out_channels),
File "/home/deepenai/SandeepMenon/superpoint/superpoint_graph/learning/../learning/ecc/cuda_kernels.py", line 125, in conv_aggregate_fw
function, stream = get_kernel_func('conv_aggregate_fw_kernel_v2', conv_aggregate_fw_kernel_v2(), get_dtype(src))
File "/home/deepenai/SandeepMenon/superpoint/superpoint_graph/learning/../learning/ecc/cuda_kernels.py", line 43, in get_kernel_func
module.load(bytes(ptx.encode()))
File "cupy/cuda/function.pyx", line 241, in cupy.cuda.function.Module.load
File "cupy/cuda/function.pyx", line 243, in cupy.cuda.function.Module.load
File "cupy_backends/cuda/api/driver.pyx", line 246, in cupy_backends.cuda.api.driver.moduleLoadData
File "cupy_backends/cuda/api/driver.pyx", line 124, in cupy_backends.cuda.api.driver.check_status
cupy_backends.cuda.api.driver.CUDADriverError: CUDA_ERROR_INVALID_PTX: a PTX JIT compilation failed
Environment details
Pytorch: 1.8.1
cupy: 8.6.0
CUDA: 11.0
Torch geometric: 1.7.0
gcc and g++: 7.5.0-3ubuntu1~18.04
I know the versions are higher than what is mentioned in the readme, but I was able to run the "Learned Partition script" for training in the above environment using the shell script, and was able to run the quality evaluation script for it as well.
This looks like an issue with the cupy installation. Uninstall cupy and reinstall it with `pip install cupy-cuda110`.
Thank you. The issue was a mismatch of the CUDA toolkit version.