Error building SAM when using DeepSpeed Zero3 CPU-offload
hazby2002 opened this issue · comments
HolyK commented
Traceback (most recent call last):
File "/home/haybzer/workspace/NExT-Chat/mllm/pipeline/finetune.py", line 141, in <module>
main()
File "/home/haybzer/workspace/NExT-Chat/mllm/pipeline/finetune.py", line 34, in main
model, preprocessor = load_pretrained(cfg.model_args, training_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/workspace/NExT-Chat/mllm/models/builder/builder.py", line 15, in load_pretrained
return load_pretrained_nextchat(model_args, training_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/workspace/NExT-Chat/mllm/models/builder/build_nextchat.py", line 128, in load_pretrained_nextchat
model = NextChatForSegLM.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2700, in from_pretrained
model = cls(config, *model_args, **model_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
f(module, *args, **kwargs)
File "/home/haybzer/workspace/NExT-Chat/mllm/models/nextchat/nextchat_seg.py", line 17, in __init__
self.sam = SamForLMSeg("vit_h", config.sam_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
f(module, *args, **kwargs)
File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1346, in __init__
self.model = sam_model_registry[model_type](checkpoint=ckpt)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1246, in build_sam_vit_h
return _build_sam(
^^^^^^^^^^^
File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1297, in _build_sam
sam = Sam(
^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
f(module, *args, **kwargs)
File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1099, in __init__
self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 253, in new_tensor
tensor = _orig_torch_empty(0, device=device).new_empty(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: new_empty(): argument 'size' (position 1) must be tuple of ints, but found element of type float at pos 0
Traceback (most recent call last):
File "/home/haybzer/anaconda3/envs/next_chat/bin/accelerate", line 10, in <module>
sys.exit(main())
^^^^^^
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1057, in launch_command
simple_launcher(args)
File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/launch.py", line 673, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/haybzer/anaconda3/envs/next_chat/bin/python3.11', 'mllm/pipeline/finetune.py', 'config/nextchat_stage3_deepspeed.py', '--cfg-options', 'model_args.model_name_or_path=./ckpt/nextchat-7b-336', 'model_args.mm_projector_depth=2', '--num_train_epochs', '3', '--save_steps', '5000', '--output_dir', './output/stage3']' returned non-zero exit status 1.
I tried changing `torch.Tensor(pixel_mean)` to `torch.tensor(pixel_mean)`, but that leads to a different error:
RuntimeError: Error(s) in loading state_dict for Sam:
size mismatch for image_encoder.pos_embed: copying a param with shape torch.Size([1, 64, 64, 1280]) from checkpoint, the shape in current model is torch.Size([0]).
size mismatch for image_encoder.patch_embed.proj.weight: copying a param with shape torch.Size([1280, 3, 16, 16]) from checkpoint, the shape in current model is torch.Size([0]).
size mismatch for image_encoder.patch_embed.proj.bias: copying a param with shape torch.Size([1280]) from checkpoint, the shape in current model is torch.Size([0]).
size mismatch for image_encoder.blocks.0.norm1.weight: copying a param with shape torch.Size([1280]) from checkpoint, the shape in current model is torch.Size([0]).
......
Here is my `zero3.json`:
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}