DachengLi1 / LongChat

Official repository for LongChat and LongEval

OutOfMemoryError: CUDA out of memory.

brewswang opened this issue · comments

I have 9 V100 16GB GPUs, but training fails with a CUDA out-of-memory error. The specific errors are as follows:
Formatting inputs...Skip in lazy mode
/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py:295: UserWarning: FSDP is switching to use NO_SHARD instead of ShardingStrategy.FULL_SHARD since the world size is 1.
warnings.warn(
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /nvme/soft/brewswang/chatgpt/LongChat/longchat/train/fine_tune/train_condense_16K.py:15 in │
│ │
│ │
│ 12 from longchat.train.fine_tune.train import train │
│ 13 │
│ 14 if __name__ == "__main__": │
│ ❱ 15 │ train() │
│ 16 │
│ │
│ /nvme/soft/brewswang/chatgpt/LongChat/longchat/train/fine_tune/train.py:262 in train │
│ │
│ 259 │ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): │
│ 260 │ │ trainer.train(resume_from_checkpoint=True) │
│ 261 │ else: │
│ ❱ 262 │ │ trainer.train() │
│ 263 │ trainer.save_state() │
│ 264 │ safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) │
│ 265 │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py:16 │
│ 62 in train │
│ │
│ 1659 │ │ inner_training_loop = find_executable_batch_size( │
│ 1660 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1661 │ │ ) │
│ ❱ 1662 │ │ return inner_training_loop( │
│ 1663 │ │ │ args=args, │
│ 1664 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1665 │ │ │ trial=trial, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py:17 │
│ 49 in _inner_training_loop │
│ │
│ 1746 │ │ if args.gradient_checkpointing: │
│ 1747 │ │ │ self.model.gradient_checkpointing_enable() │
│ 1748 │ │ │
│ ❱ 1749 │ │ model = self._wrap_model(self.model_wrapped) │
│ 1750 │ │ │
│ 1751 │ │ if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: │
│ 1752 │ │ │ self._load_from_checkpoint(resume_from_checkpoint, model) │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py:14 │
│ 89 in _wrap_model │
│ │
│ 1486 │ │ │ │ │ for arg in ["limit_all_gathers", "forward_prefetch", "backward_prefe │
│ 1487 │ │ │ │ │ │ if arg in signature: │
│ 1488 │ │ │ │ │ │ │ kwargs[arg] = getattr(self, arg) │
│ ❱ 1489 │ │ │ │ │ self.model = model = FSDP( │
│ 1490 │ │ │ │ │ │ model, │
│ 1491 │ │ │ │ │ │ sharding_strategy=self.fsdp, │
│ 1492 │ │ │ │ │ │ cpu_offload=cpu_offload, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/ful │
│ ly_sharded_data_parallel.py:391 in __init__ │
│ │
│ 388 │ │ │ │ # process groups. │
│ 389 │ │ │ │ fsdp_kwargs["process_group"] = (self.process_group, self._inter_node_pg) │
│ 390 │ │ │ │
│ ❱ 391 │ │ │ _auto_wrap(auto_wrap_kwargs, fsdp_kwargs, FullyShardedDataParallel) │
│ 392 │ │ │
│ 393 │ │ backward_prefetch_limit = 1 │
│ 394 │ │ forward_prefetch_limit = 1 │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_wr │
│ ap_utils.py:73 in _auto_wrap │
│ │
│ 70 │ │ │ "kernels do not support low precision." │
│ 71 │ │ ) │
│ 72 │ auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy │
│ ❱ 73 │ _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs) │
│ 74 │
│ 75 │
│ 76 def _get_fully_sharded_module_to_states( │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wra │
│ p.py:370 in _recursive_wrap │
│ │
│ 367 │ │ for name, child in module.named_children(): │
│ 368 │ │ │ if child in ignored_modules: │
│ 369 │ │ │ │ continue │
│ ❱ 370 │ │ │ wrapped_child, num_wrapped_params = _recursive_wrap( │
│ 371 │ │ │ │ module=child, │
│ 372 │ │ │ │ auto_wrap_policy=auto_wrap_policy, │
│ 373 │ │ │ │ wrapper_cls=wrapper_cls, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wra │
│ p.py:370 in _recursive_wrap │
│ │
│ 367 │ │ for name, child in module.named_children(): │
│ 368 │ │ │ if child in ignored_modules: │
│ 369 │ │ │ │ continue │
│ ❱ 370 │ │ │ wrapped_child, num_wrapped_params = _recursive_wrap( │
│ 371 │ │ │ │ module=child, │
│ 372 │ │ │ │ auto_wrap_policy=auto_wrap_policy, │
│ 373 │ │ │ │ wrapper_cls=wrapper_cls, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wra │
│ p.py:370 in _recursive_wrap │
│ │
│ 367 │ │ for name, child in module.named_children(): │
│ 368 │ │ │ if child in ignored_modules: │
│ 369 │ │ │ │ continue │
│ ❱ 370 │ │ │ wrapped_child, num_wrapped_params = _recursive_wrap( │
│ 371 │ │ │ │ module=child, │
│ 372 │ │ │ │ auto_wrap_policy=auto_wrap_policy, │
│ 373 │ │ │ │ wrapper_cls=wrapper_cls, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wra │
│ p.py:388 in _recursive_wrap │
│ │
│ 385 │ │ │ module=module, recurse=False, nonwrapped_numel=remainder │
│ 386 │ │ ): │
│ 387 │ │ │ # Leaf node or final wrapping of the remainder both happen here. │
│ ❱ 388 │ │ │ return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel │
│ 389 │ │ else: │
│ 390 │ │ │ return module, total_wrapped_numel │
│ 391 │ return module, 0 │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wra │
│ p.py:317 in _wrap │
│ │
│ 314 │ │ overrides = {**kwargs, **module._wrap_overrides} # type: ignore[arg-type] │
│ 315 │ │ return wrapper_cls(module, **overrides) │
│ 316 │ │
│ ❱ 317 │ return wrapper_cls(module, **kwargs) │
│ 318 │
│ 319 │
│ 320 def _recursive_wrap( │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/ful │
│ ly_sharded_data_parallel.py:408 in __init__ │
│ │
│ 405 │ │ _init_runtime_state(self) │
│ 406 │ │ _init_prefetching_state(self, backward_prefetch, forward_prefetch) │
│ 407 │ │ _init_buffer_state(self, module) │
│ ❱ 408 │ │ _init_param_handle_from_module( │
│ 409 │ │ │ self, │
│ 410 │ │ │ module, │
│ 411 │ │ │ device_id, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_in │
│ it_utils.py:429 in _init_param_handle_from_module │
│ │
│ 426 │ │ _sync_module_params_and_buffers( │
│ 427 │ │ │ fully_sharded_module, managed_params, state.process_group │
│ 428 │ │ ) │
│ ❱ 429 │ _init_param_handle_from_params(state, managed_params, fully_sharded_module) │
│ 430 │ return state │
│ 431 │
│ 432 │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_in │
│ it_utils.py:525 in _init_param_handle_from_params │
│ │
│ 522 ): │
│ 523 │ if len(params) == 0: │
│ 524 │ │ return │
│ ❱ 525 │ handle = FlatParamHandle( │
│ 526 │ │ params, │
│ 527 │ │ fully_sharded_module, │
│ 528 │ │ state.compute_device, │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/fla │
│ t_param.py:366 in __init__ │
│ │
│ 363 │ │ self._training_state = HandleTrainingState.IDLE │
│ 364 │ │ self._debug_level = dist.get_debug_level() │
│ 365 │ │ self._fully_sharded_module = fully_sharded_module │
│ ❱ 366 │ │ self._init_flat_param(params, fully_sharded_module, use_orig_params) │
│ 367 │ │ self._orig_param_dtype = self.flat_param.dtype │
│ 368 │ │ self._use_unsharded_views(as_params=False) │
│ 369 │ │ self._init_param_reduce_dtypes(mp_param_dtype, mp_reduce_dtype) │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/fla │
│ t_param.py:462 in _init_flat_param │
│ │
│ 459 │ │ │ "Passed-in params were not found in the module tree\n" │
│ 460 │ │ │ f"params: {params}\nmodule: {module}" │
│ 461 │ │ ) │
│ ❱ 462 │ │ self.flat_param = FlatParamHandle.flatten_params( │
│ 463 │ │ │ params_to_flatten, requires_grad │
│ 464 │ │ ) │
│ 465 │ │ # For use_orig_params=True, ensure that the logical parameters are │
│ │
│ /home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/fla │
│ t_param.py:505 in flatten_params │
│ │
│ 502 │ │ │ │ p.detach().reshape(-1) if isinstance(p, nn.Parameter) else p.reshape(-1) │
│ 503 │ │ │ │ for p in params │
│ 504 │ │ │ ] │
│ ❱ 505 │ │ │ flat_param_data = torch.cat(flat_params, dim=0) │
│ 506 │ │ flat_param = FlatParameter(flat_param_data, requires_grad=requires_grad) │
│ 507 │ │ return flat_param │
│ 508 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
OutOfMemoryError: CUDA out of memory. Tried to allocate 774.00 MiB (GPU 0; 15.78 GiB total capacity; 14.62 GiB already allocated;
369.69 MiB free; 14.72 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to
avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23015) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in
main()
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED

Failures:
<NO_OTHER_FAILURES>

Root Cause (first observed failure):
[0]:
time : 2023-07-02_15:05:45
host : localhost.localdomain
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 23015)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

@brewswang Thanks for trying out the training code! In this release, the code has only been tested on 8xA100 for a 7B model, because the very long sequence length causes high memory consumption. To run on V100 16GB, first change the monkey_patch here from flash attention to xformer.

There are several things to try:

(1) Use FSDP CPU offloading.
(2) Try a lower sequence length (remember to also change the ratio to a lower value, e.g. 2 for 4K, 4 for 8K); see the sketch below.
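For suggestion (2), a minimal sketch of what the ratio change could look like in a lower-sequence-length variant of train_condense_16K.py; the 8K/ratio-4 pairing here is just an illustration of the rule above:

# Hypothetical 8K variant: halve the target context, so halve the condense ratio.
# As in train_condense_16K.py, call this before importing transformers.
from longchat.train.monkey_patch.llama_condense_monkey_patch import replace_llama_with_condense
replace_llama_with_condense(ratio=4)  # 2 for 4K, 4 for 8K (16K uses 8)

You would also lower --model_max_length in the training command to match. For suggestion (1), CPU offloading is enabled by adding "offload" to the trainer's --fsdp argument, which the command posted later in this thread already does.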

Let me know if it works for you!

my train_condense_16K.py file content:

# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
# Need to call this before importing transformers.
from longchat.train.monkey_patch.llama_condense_monkey_patch import replace_llama_with_condense

replace_llama_with_condense(ratio=8)

from longchat.train.monkey_patch.llama_xformer_monkey_patch import replace_llama_attn_with_xformer

replace_llama_attn_with_xformer()

from longchat.train.fine_tune.train import train

if __name__ == "__main__":
    train()
my train command:

CUDA_VISIBLE_DEVICES=0,2,3,4,5,6,7,8,9 python -m torch.distributed.run --nproc_per_node=1 \
    longchat/train/fine_tune/train_condense_16K.py \
    --model_name_or_path model/open_llama_7b/ \
    --data_path data/dummy_conversation.json \
    --bf16 False \
    --output_dir outputs/models \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 1000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "no_shard offload" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 False \
    --model_max_length 100 \
    --gradient_checkpointing True \
    --lazy_preprocess True

output:
OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 15.78 GiB total capacity; 14.94 GiB already allocated;
41.69 MiB free; 15.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to
avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23915) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in
main()
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED

Please change nproc_per_node to the number of GPUs you have; I would also suggest using 8 GPUs instead of 9.
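Concretely, the launch line from the command above would become something like the following (which 8 of the 9 GPUs you keep visible is up to you; the remaining arguments stay the same):

CUDA_VISIBLE_DEVICES=0,2,3,4,5,6,7,8 python -m torch.distributed.run --nproc_per_node=8 \
    longchat/train/fine_tune/train_condense_16K.py \
    ... (rest of the arguments unchanged)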

I got errors:
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24901 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24902 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24903 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24904 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24906 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24907 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24908 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24909 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 24905) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in
main()
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED

Failures:
<NO_OTHER_FAILURES>

Root Cause (first observed failure):
[0]:
time : 2023-07-02_18:08:34
host : localhost.localdomain
rank : 4 (local_rank: 4)
exitcode : -9 (pid: 24905)
error_file: <N/A>
traceback : Signal 9 (SIGKILL) received by PID 24905

I also met this problem. I guess it was caused by running out of RAM (host memory).
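If host RAM really is the culprit, the exitcode -9 (SIGKILL) above is consistent with the Linux OOM killer terminating the worker; a quick way to check on the host (plain Linux commands, not specific to this repo):

# Look for OOM-killer entries around the time the worker was killed.
dmesg -T | grep -i -E "out of memory|killed process"

# Watch host memory while training; FSDP CPU offloading keeps a lot of state in RAM.
free -h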