NVlabs / RVT

Official Code for RVT-2 and RVT

Home Page: https://robotic-view-transformer-2.github.io/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Issues while running the training script

hars-singh opened this issue · comments

Hi, thanks for your great work!

I am trying to run the training script as is, without using the replay buffer.
I am getting the following errors:

Start training ...
Rank [0], Epoch [0]: Training on train dataset
Rank [0], Epoch [0]: Training on train dataset
Rank [0], Epoch [0]: Training on train dataset
Rank [0], Epoch [0]: Training on train dataset
Traceback (most recent call last):
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 300, in
mp.spawn(experiment, args=(cmd_args, devices, port), nprocs=len(devices), join=True)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 3 terminated with the following error:
Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 260, in experiment
out = train(agent, train_dataset, TRAINING_ITERATIONS, rank)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 54, in train
raw_batch = next(data_iter)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in next
data = self._next_data()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
data = next(self.dataset_iter)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/wrappers/pytorch_replay_buffer.py", line 41, in _generator
yield self._replay_buffer.sample_transition_batch(pack_in_dict=True, distribution_mode = self._sample_distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 772, in sample_transition_batch
indices = self.sample_index_batch(batch_size, distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 706, in sample_index_batch
state_index = np.random.randint(low = self._task_replay_start_index[task_index],
File "mtrand.pyx", line 765, in numpy.random.mtrand.RandomState.randint
File "_bounded_integers.pyx", line 1247, in numpy.random._bounded_integers._rand_int64
ValueError: high <= 0

Traceback (most recent call last):
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 300, in
mp.spawn(experiment, args=(cmd_args, devices, port), nprocs=len(devices), join=True)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 260, in experiment
out = train(agent, train_dataset, TRAINING_ITERATIONS, rank)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 54, in train
raw_batch = next(data_iter)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in next
data = self._next_data()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
data = next(self.dataset_iter)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/wrappers/pytorch_replay_buffer.py", line 41, in _generator
yield self._replay_buffer.sample_transition_batch(pack_in_dict=True, distribution_mode = self._sample_distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 772, in sample_transition_batch
indices = self.sample_index_batch(batch_size, distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 706, in sample_index_batch
state_index = np.random.randint(low = self._task_replay_start_index[task_index],
File "mtrand.pyx", line 765, in numpy.random.mtrand.RandomState.randint
File "_bounded_integers.pyx", line 1247, in numpy.random._bounded_integers._rand_int64
ValueError: high <= 0

Traceback (most recent call last):
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 300, in
mp.spawn(experiment, args=(cmd_args, devices, port), nprocs=len(devices), join=True)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 3 terminated with the following error:
Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 260, in experiment
out = train(agent, train_dataset, TRAINING_ITERATIONS, rank)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 54, in train
raw_batch = next(data_iter)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in next
data = self._next_data()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
data = next(self.dataset_iter)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/wrappers/pytorch_replay_buffer.py", line 41, in _generator
yield self._replay_buffer.sample_transition_batch(pack_in_dict=True, distribution_mode = self._sample_distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 772, in sample_transition_batch
indices = self.sample_index_batch(batch_size, distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 706, in sample_index_batch
state_index = np.random.randint(low = self._task_replay_start_index[task_index],
File "mtrand.pyx", line 765, in numpy.random.mtrand.RandomState.randint
File "_bounded_integers.pyx", line 1247, in numpy.random._bounded_integers._rand_int64
ValueError: high <= 0

Traceback (most recent call last):
File "", line 1, in
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
File "", line 1, in
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 300, in
mp.spawn(experiment, args=(cmd_args, devices, port), nprocs=len(devices), join=True)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 3 terminated with the following error:
Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 260, in experiment
out = train(agent, train_dataset, TRAINING_ITERATIONS, rank)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/train.py", line 54, in train
raw_batch = next(data_iter)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in next
data = self._next_data()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
data = next(self.dataset_iter)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/wrappers/pytorch_replay_buffer.py", line 41, in _generator
yield self._replay_buffer.sample_transition_batch(pack_in_dict=True, distribution_mode = self._sample_distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 772, in sample_transition_batch
indices = self.sample_index_batch(batch_size, distribution_mode)
File "/home/kiyogi/harsh/RVT_related_stuff/RVT/RVT/rvt/libs/YARR/yarr/replay_buffer/uniform_replay_buffer.py", line 706, in sample_index_batch
state_index = np.random.randint(low = self._task_replay_start_index[task_index],
File "mtrand.pyx", line 765, in numpy.random.mtrand.RandomState.randint
File "_bounded_integers.pyx", line 1247, in numpy.random._bounded_integers._rand_int64
ValueError: low >= high

srun: error: gpu-11: task 2: Exited with exit code 1
Traceback (most recent call last):
File "", line 1, in
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
File "", line 1, in
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/kiyogi/miniconda3/envs/rvt/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
srun: error: gpu-11: tasks 0-1: Exited with exit code 1
srun: error: gpu-11: task 3: Exited with exit code 1