你好,在进行指令微调时,单机多卡训练出现如下错误:
DendiHust opened this issue · comments
启动脚本
# Disable NCCL peer-to-peer GPU transfers (workaround for multi-GPU comms issues).
export NCCL_P2P_DISABLE=1
# Force synchronous CUDA kernel launches so device-side asserts surface at the failing call.
export CUDA_LAUNCH_BLOCKING=1
# Launch LoRA fine-tuning on GPUs 0 and 1.
# NOTE(review): plain `python` does not spawn one process per GPU; for true
# multi-GPU data-parallel training use `accelerate launch` instead (see reply below).
CUDA_VISIBLE_DEVICES=0,1 python finetune/lora/finetune.py \
    --data_path "/home/qizhen/CaMA/data/" \
    --base_model "/home/qizhen/pre_models/cama/cama" \
    --batch_size 4 \
    --micro_batch_size 4 \
    --num_epochs 8 \
    --learning_rate 3e-4 \
    --cutoff_len 512 \
    --lora_r 16 \
    --lora_alpha 32 \
    --lora_dropout 0.05 \
    --save_steps 1 \
    --save_total_limit 20 \
    --eval_steps 1 \
    --logging_steps 5
错误日志
CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...
Training Alpaca-LoRA model with params:
base_model: /home/qizhen/pre_models/cama/cama
data_path: /home/qizhen/CaMA/data/
output_dir: ./checkpoint
batch_size: 4
micro_batch_size: 4
num_epochs: 8
learning_rate: 0.0003
cutoff_len: 512
val_set_size: 2000
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules: ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']
train_on_inputs: False
group_by_length: False
wandb_project:
wandb_run_name:
wandb_watch:
wandb_log_model:
resume_from_checkpoint: False
prompt template: alpaca
Loading checkpoint shards: 100%|██████████| 6/6 [00:39<00:00, 6.61s/it]
Found cached dataset json (/home/qizhen/.cache/huggingface/datasets/json/default-40d7b2f86cd0b651/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
trainable params: 62586880 || all params: 13078451200 || trainable%: 0.4785496313202591
data includes: ['/home/qizhen/CaMA/data/sft_all.json']
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 43.45it/s]
Loading cached split indices for dataset at /home/qizhen/.cache/huggingface/datasets/json/default-40d7b2f86cd0b651/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-c0aaf1542311ded0.arrow and /home/qizhen/.cache/huggingface/datasets/json/default-40d7b2f86cd0b651/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-dea9c392c84db597.arrow
Map (num_proc=8): 100%|█████████▉| 744342/744391 [01:52<00:00, 1294.17 examples/s]
Map (num_proc=8): 97%|█████████▋| 1931/2000 [00:00<00:00, 5187.79 examples/s]
0%| | 0/1488784 [00:00<?, ?it/s]/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [44,0,0], thread: [0,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [44,0,0], thread: [1,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [20,0,0], thread: [127,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
Traceback (most recent call last):
File "/home/qizhen/CaMA/finetune/lora/finetune.py", line 296, in <module>
fire.Fire(train)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/home/qizhen/CaMA/finetune/lora/finetune.py", line 286, in train
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/trainer.py", line 1662, in train
return inner_training_loop(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/trainer.py", line 2699, in training_step
loss = self.compute_loss(model, inputs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/trainer.py", line 2731, in compute_loss
outputs = model(**inputs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/peft/peft_model.py", line 530, in forward
return self.base_model(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 687, in forward
outputs = self.model(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 577, in forward
layer_outputs = decoder_layer(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 292, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 204, in forward
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
File "/home/qizhen/anaconda3/envs/cama/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 137, in apply_rotary_pos_emb
sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
RuntimeError: CUDA error: device-side assert triggered
0%| | 0/1488784 [00:13<?, ?it/s]
CUDA_VISIBLE_DEVICES=0,1 accelerate launch finetune/lora/finetune.py
感谢您的关注与反馈,建议您在使用多卡训练时使用这个命令。我们也已经修改相关说明。
并且请检查您的数据集是否按照格式
[
{"instruction": "", "input":"", "output":""},
{"instruction": "", "input":"", "output":""},
...
]
请问您的问题是否已解决?
请问您的问题是否已解决?
解决了,感谢