OmarMohammed88 / AR-Emotion-Recognition

An implementation of the paper titled "Arabic Speech Emotion Recognition Employing Wav2vec2.0 and HuBERT Based on BAVED Dataset" https://journals.scholarpublishing.org/index.php/TMLAI/article/view/11039

CUDA out of memory

ama454 opened this issue · comments

if training_args.do_train:
    print(f"last_checkpoint: {last_checkpoint}")
    train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
    trainer.save_model()
    feature_extractor.save_pretrained(training_args.output_dir)
    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
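
For reference, last_checkpoint is not defined in this snippet; it is normally resolved beforehand, for example with get_last_checkpoint from transformers. A minimal sketch under that assumption, reusing the variable names from the snippet above:

import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in output_dir if one exists;
# otherwise last_checkpoint stays None and training starts from scratch,
# which matches the "last_checkpoint: None" printed in the log below.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)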

The following columns in the training set don't have a corresponding argument in HubertForSpeechClassification.forward and have been ignored: name, emotion. If name, emotion are not expected by HubertForSpeechClassification.forward, you can safely ignore this message.
/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
warnings.warn(
***** Running training *****
Num examples = 105
Num Epochs = 15
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 64
Gradient Accumulation steps = 2
Total optimization steps = 30
Number of trainable parameters = 90764163
last_checkpoint: None

OutOfMemoryError Traceback (most recent call last)
in
1 if training_args.do_train:
2 print(f"last_checkpoint: {last_checkpoint}")
----> 3 train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
4 trainer.save_model()
5 feature_extractor.save_pretrained(training_args.output_dir)

14 frames
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1552 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1553 )
-> 1554 return inner_training_loop(
1555 args=args,
1556 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1800 tr_loss_step = self.training_step(model, inputs)
1801 else:
-> 1802 tr_loss_step = self.training_step(model, inputs)
1803
1804 if (

in training_step(self, model, inputs)
43 if self.use_cuda_amp:
44 with autocast():
---> 45 loss = self.compute_loss(model, inputs)
46 else:
47 loss = self.compute_loss(model, inputs)

/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
2580 else:
2581 labels = None
-> 2582 outputs = model(**inputs)
2583 # Save past state if it exists
2584 # TODO: this needs to be fixed and made cleaner later.

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

in forward(self, input_values, attention_mask, output_attentions, output_hidden_states, return_dict, labels)
74 ):
75 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
---> 76 outputs = self.hubert(
77 input_values,
78 attention_mask=attention_mask,

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.8/dist-packages/transformers/models/hubert/modeling_hubert.py in forward(self, input_values, attention_mask, mask_time_indices, output_attentions, output_hidden_states, return_dict)
1063 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1064
-> 1065 extract_features = self.feature_extractor(input_values)
1066 extract_features = extract_features.transpose(1, 2)
1067

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.8/dist-packages/transformers/models/hubert/modeling_hubert.py in forward(self, input_values)
357 )
358 else:
--> 359 hidden_states = conv_layer(hidden_states)
360
361 return hidden_states

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.8/dist-packages/transformers/models/hubert/modeling_hubert.py in forward(self, hidden_states)
257 def forward(self, hidden_states):
258 hidden_states = self.conv(hidden_states)
--> 259 hidden_states = self.layer_norm(hidden_states)
260 hidden_states = self.activation(hidden_states)
261 return hidden_states

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/normalization.py in forward(self, input)
271
272 def forward(self, input: Tensor) -> Tensor:
--> 273 return F.group_norm(
274 input, self.num_groups, self.weight, self.bias, self.eps)
275

/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py in group_norm(input, num_groups, weight, bias, eps)
2526 return handle_torch_function(group_norm, (input, weight, bias,), input, num_groups, weight=weight, bias=bias, eps=eps)
2527 _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
-> 2528 return torch.group_norm(input, num_groups, weight, bias, eps, torch.backends.cudnn.enabled)
2529
2530

OutOfMemoryError: CUDA out of memory. Tried to allocate 13.72 GiB (GPU 0; 14.76 GiB total capacity; 7.70 GiB already allocated; 5.75 GiB free; 7.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
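
The error message itself suggests setting max_split_size_mb via the PYTORCH_CUDA_ALLOC_CONF environment variable, which can help when the failure comes from allocator fragmentation rather than a genuinely too-large allocation. A minimal sketch of how to set it before the GPU is first touched (the 128 MiB value is only an example, not a recommendation from this thread):

import os

# Must be set before the first CUDA allocation (ideally in the first cell,
# before importing torch or building the model).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"  # example value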

This error appeared with HuBERT, even though the dataset I used is much smaller than BAVED.

Try reducing the batch size to 16 or 8.
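
A minimal sketch of what that could look like with TrainingArguments, raising gradient accumulation to keep the total train batch size of 64 from the log above (the output directory name is an assumption):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./hubert-baved",       # assumed output directory
    per_device_train_batch_size=8,     # was 32 in the log above
    gradient_accumulation_steps=8,     # was 2; 8 * 8 = 64 keeps the effective batch size
    num_train_epochs=15,
    do_train=True,
)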

Same problem.

If I pay for Google Colab Pro, will it solve the problem?

I bought Google Colab Pro and I still get the same error.