RuntimeError: element 1 of tensors does not require grad and does not have a grad_fn
SparkJiao opened this issue · comments
Hi, wonderful work!
I didn't use your code directly, but I followed it to implement my own LLaMA pipeline parallelism. However, I'm encountering the following problem. May I know if you have encountered similar problems? I have no idea about the solution.
Thank you very much for your help!
The error message:
Traceback (most recent call last)
File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 418, in <module>
main()
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/main.py", line 90, in decorated_main
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 389, in _run_hydra
_run_app(
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 452, in _run_app
run_and_report(
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 216, in run_and_report
raise ex
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 213, in run_and_report
return func()
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 453, in <lambda>
lambda: hydra.run(
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 352, in main
global_step, tr_loss = train(cfg, model, tokenizer, continue_from_global_step)
File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 212, in train
loss = model.train_batch(sub_train_dataloader)
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 336, in train_batch
self._exec_schedule(sched)
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 1307, in _exec_schedule
self._exec_instr(**cmd.kwargs)
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 733, in _exec_backward_pass
torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors)
File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/torch/autograd/__init__.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 1 of tensors does not require grad and does not have a grad_fn
Here is a toy dataset:
class TestDataset(Dataset):
    """Toy dataset that endlessly serves one hard-coded FLAN-style example.

    Intended purely for debugging the pipeline-parallel training loop:
    ``__len__`` is effectively infinite so the dataloader never runs dry.
    """

    def __init__(self, file_path, tokenizer):
        # `file_path` and `tokenizer` are accepted only to match the usual
        # dataset constructor signature; neither is used.
        super().__init__()
        self.data = ["My name is Jiao Fangkai."]

    def __len__(self):
        # Huge sentinel length — lets training iterate "forever" on one sample.
        return 100000000

    def __getitem__(self, index):
        # Every index maps to the same single example.
        sample = self.data[0]
        return {"flan": {"inputs": sample, "targets": sample}}
Here is the collator:
def vanilla_seq2seq_convertor(examples, tokenizer: PreTrainedTokenizer, max_seq_length, decoder_only: bool = False):
    """Tokenize a batch of {"inputs", "targets"} examples for seq2seq or decoder-only LM training.

    For ``decoder_only=True`` the target text is the prompt concatenated with the
    answer plus EOS, and an ``input_lens`` tensor is attached that records how
    many leading (prompt) tokens each row contains, so labels can later mask
    the prompt portion.
    """
    source_texts = [exp["inputs"] for exp in examples]
    if decoder_only:
        # Decoder-only LM: the model consumes prompt + answer as one sequence.
        target_texts = [exp["inputs"] + " " + exp["targets"] + tokenizer.eos_token for exp in examples]
    else:
        target_texts = [exp["targets"] for exp in examples]

    model_inputs = tokenizer(source_texts, text_target=target_texts, max_length=max_seq_length,
                             padding="longest", truncation=True, return_tensors="pt")

    if decoder_only:
        # Non-pad token count of the prompt-only encoding (assumes right padding
        # with pad_token_id — TODO confirm tokenizer config).
        prompt_lens = model_inputs["input_ids"].ne(tokenizer.pad_token_id).sum(dim=1)
        # Re-tokenize the full prompt+answer sequence; this replaces model_inputs.
        model_inputs = tokenizer(target_texts, max_length=max_seq_length,
                                 padding="longest", truncation=True, return_tensors="pt")
        full_lens = model_inputs["input_ids"].ne(tokenizer.pad_token_id).sum(dim=1)
        # Heuristic: if truncation made prompt length equal full length, halve
        # the prompt length so some answer tokens remain unmasked.
        prompt_lens = prompt_lens - prompt_lens.eq(full_lens).to(prompt_lens.dtype) * (prompt_lens // 2)
        model_inputs["input_lens"] = prompt_lens.to(torch.long)

    return model_inputs
def get_lm_labels(input_lens, input_ids, pad_token_id):
    """Build LM labels from ``input_ids``, masking prompt and padding with -100.

    A position is kept as a label only when it is (a) not a pad token and
    (b) at index >= that row's ``input_lens`` value (i.e. part of the answer,
    not the prompt). All other positions are set to -100 so the loss ignores
    them.
    """
    labels = input_ids.clone()
    # (1, seq_len) >= (batch, 1) broadcast: True where the position lies past
    # the per-row prompt length.
    past_prompt = torch.arange(labels.size(1))[None, :] >= input_lens[:, None]
    keep = labels.ne(pad_token_id) & past_prompt
    return labels.masked_fill(~keep, -100).contiguous()
class FlanCollatorOverCollator:
    """Collator that tokenizes the "flan" field of each sample into DeepSpeed
    pipeline inputs: a ``(inputs_tuple, labels)`` pair.

    Fix vs. original: removed the unreachable ``return model_inputs`` that
    followed the real return statement, and dropped commented-out dead code.
    """

    def __init__(self, tokenizer: str, max_seq_length: int, decoder_only: bool = False):
        # `tokenizer` is a model name/path; the actual tokenizer is loaded here.
        self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=False)
        expand_special_tokenizer(self.tokenizer)
        self.max_seq_length = max_seq_length
        self.decoder_only = decoder_only

    def __call__(self, batch):
        # Pull the "flan" sub-dict out of each sample.
        flan_batch = [item.pop("flan") for item in batch]
        model_inputs = vanilla_seq2seq_convertor(flan_batch, self.tokenizer, self.max_seq_length, self.decoder_only)
        # NOTE(review): `model_inputs["input_lens"]` is only produced when
        # decoder_only=True — this collator appears to assume decoder-only
        # mode; confirm before using it for encoder-decoder models.
        return (
            (
                model_inputs["input_ids"],
                model_inputs["attention_mask"],
            ),
            get_lm_labels(model_inputs["input_lens"], model_inputs["input_ids"], self.tokenizer.pad_token_id),
        )
And the initialization:
# 4 pipeline stages, no tensor (model) parallelism, no data parallelism.
topo = PipeModelDataParallelTopology(num_pp=4, num_mp=1, num_dp=1)
# Wrap the layer list in DeepSpeed's PipelineModule; `loss_fn` runs on the
# last stage against the labels tuple produced by the collator.
model = PipelineModule(layers=layers,
# num_stages=cfg.num_stages,
topology=topo,
loss_fn=models.llama_ds_mp_wrap.loss_fn,
# 0 disables activation checkpointing unless overridden in the config.
activation_checkpoint_interval=getattr(cfg, "activation_checkpoint_interval", 0))