Error building SAM when using DeepSpeed Zero3 CPU-offload

Question

Building the SAM model fails with the error below when fine-tuning with DeepSpeed ZeRO-3 and CPU offload enabled:

hazby2002 opened this issue 3 months ago · comments

Traceback (most recent call last):
  File "/home/haybzer/workspace/NExT-Chat/mllm/pipeline/finetune.py", line 141, in <module>
    main()
  File "/home/haybzer/workspace/NExT-Chat/mllm/pipeline/finetune.py", line 34, in main
    model, preprocessor = load_pretrained(cfg.model_args, training_args)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/builder/builder.py", line 15, in load_pretrained
    return load_pretrained_nextchat(model_args, training_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/builder/build_nextchat.py", line 128, in load_pretrained_nextchat
    model = NextChatForSegLM.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2700, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
    f(module, *args, **kwargs)
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/nextchat/nextchat_seg.py", line 17, in __init__
    self.sam = SamForLMSeg("vit_h", config.sam_path)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
    f(module, *args, **kwargs)
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1346, in __init__
    self.model = sam_model_registry[model_type](checkpoint=ckpt)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1246, in build_sam_vit_h
    return _build_sam(
           ^^^^^^^^^^^
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1297, in _build_sam
    sam = Sam(
          ^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 503, in wrapper
    f(module, *args, **kwargs)
  File "/home/haybzer/workspace/NExT-Chat/mllm/models/sam/modeling_sam.py", line 1099, in __init__
    self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
                                       ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 253, in new_tensor
    tensor = _orig_torch_empty(0, device=device).new_empty(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: new_empty(): argument 'size' (position 1) must be tuple of ints, but found element of type float at pos 0
Traceback (most recent call last):
  File "/home/haybzer/anaconda3/envs/next_chat/bin/accelerate", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
    args.func(args)
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1057, in launch_command
    simple_launcher(args)
  File "/home/haybzer/anaconda3/envs/next_chat/lib/python3.11/site-packages/accelerate/commands/launch.py", line 673, in simple_launcher
    raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/haybzer/anaconda3/envs/next_chat/bin/python3.11', 'mllm/pipeline/finetune.py', 'config/nextchat_stage3_deepspeed.py', '--cfg-options', 'model_args.model_name_or_path=./ckpt/nextchat-7b-336', 'model_args.mm_projector_depth=2', '--num_train_epochs', '3', '--save_steps', '5000', '--output_dir', './output/stage3']' returned non-zero exit status 1.

I tried changing `torch.Tensor(pixel_mean)` to `torch.tensor(pixel_mean)`. That gets past the `TypeError` above, but it leads to a different error when the state dict for `Sam` is loaded:

RuntimeError: Error(s) in loading state_dict for Sam:
	size mismatch for image_encoder.pos_embed: copying a param with shape torch.Size([1, 64, 64, 1280]) from checkpoint, the shape in current model is torch.Size([0]).
	size mismatch for image_encoder.patch_embed.proj.weight: copying a param with shape torch.Size([1280, 3, 16, 16]) from checkpoint, the shape in current model is torch.Size([0]).
	size mismatch for image_encoder.patch_embed.proj.bias: copying a param with shape torch.Size([1280]) from checkpoint, the shape in current model is torch.Size([0]).
	size mismatch for image_encoder.blocks.0.norm1.weight: copying a param with shape torch.Size([1280]) from checkpoint, the shape in current model is torch.Size([0]).
......

Here is my zero3.json:

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    }
}