Transducer GPU support
kylebgorman opened this issue · comments
Transducer training on GPU raises an error because it encounters a mixed CPU/GPU operation. Sample trace:
Epoch 0: 0%| | 0/294 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/kbg/.miniconda3/bin/yoyodyne-train", line 8, in <module>
sys.exit(main())
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/click/core.py", line 1128, in __call__
return self.main(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/yoyodyne/train.py", line 448, in main
trainer.fit(model, train_loader, eval_loader)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in fit
self._call_and_handle_interrupt(
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 723, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1236, in _run
results = self._run_stage()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1323, in _run_stage
return self._run_train()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1353, in _run_train
self.fit_loop.run()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 203, in advance
result = self._run_optimization(
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 369, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1595, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1646, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 155, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/torch/optim/optimizer.py", line 113, in wrapper
return func(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/torch/optim/adadelta.py", line 87, in step
loss = closure()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 140, in _wrap_closure
closure_result = closure()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
step_output = self._step_fn()
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 427, in _training_step
training_step_output = self.trainer._call_strategy_hook("training_step", *step_kwargs.values())
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1765, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 333, in training_step
return self.model.training_step(*args, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/yoyodyne/models/base.py", line 115, in training_step
preds = self(batch)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/yoyodyne/models/transducer.py", line 86, in forward
prediction, loss = self.decode(
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/yoyodyne/models/transducer.py", line 174, in decode
last_action = self.decode_action_step(
File "/home/kbg/.miniconda3/lib/python3.9/site-packages/yoyodyne/models/transducer.py", line 260, in decode_action_step
end_of_input = (input_length - alignment) <= 1 # 1 -> Last char.
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Epoch 0: 0%| | 0/294 [00:00<?, ?it/s]
This is a blocker for a post-beta release candidate.
Note: the problem is one of the values initialized in the decode_step
function. Debugging didn't catch any difference in flags. The issue can be solved relatively quickly given access to a GPU. (This is still my issue; I'm just writing it down for general note-keeping.)
I'm taking over this because it's related to another thing I'm working on, and I think I've got a handle on this. Nothing personal ;)
I'm taking over this because it's related to another thing I'm working on, and I think I've got a handle on this. Nothing personal ;)
Sounds good to me (I'm two weeks, i.e. one stipend check, away from getting access to a GPU anyhow).