HuangLK / transpeeder

train llama on a single A100 80G node using 🤗 transformers and 🚀 Deepspeed Pipeline Parallelism

RuntimeError: element 1 of tensors does not require grad and does not have a grad_fn

SparkJiao opened this issue

Hi, wonderful work!

I didn't use your code directly, but I followed it to implement my own LLaMA pipeline-parallel training, and I'm running into the error below. Have you encountered anything similar? I have no idea how to solve it.

Thanks very much for your help!

The error message:

Traceback (most recent call last):
  File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 418, in <module>
    main() 
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/main.py", line 90, in decorated_main
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 389, in _run_hydra
    _run_app( 
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 452, in _run_app
    run_and_report(
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 216, in run_and_report
    raise ex
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 213, in run_and_report
    return func()
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/utils.py", line 453, in <lambda>                                                 
    lambda: hydra.run(
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run                                                                          
    _ = ret.return_value 
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value                                                                      
    raise self._return_value
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job                                                                           
    ret.return_value = task_function(task_cfg)
  File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 352, in main                                                                                                                    
    global_step, tr_loss = train(cfg, model, tokenizer, continue_from_global_step)                                                                                                          
  File "/home/fangkai/merit-v2/trainer_base_ds_mp.py", line 212, in train                                                                                                                   
    loss = model.train_batch(sub_train_dataloader)                                                                                                                                          
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 336, in train_batch 
    self._exec_schedule(sched) 
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 1307, in _exec_schedule 
    self._exec_instr(**cmd.kwargs)
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed/runtime/pipe/engine.py", line 733, in _exec_backward_pass
    torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors)                                                                                                                 
  File "/home/fangkai/anaconda3/envs/py3.9/lib/python3.9/site-packages/torch/autograd/__init__.py", line 200, in backward                                                                   
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass                                                                                          
RuntimeError: element 1 of tensors does not require grad and does not have a grad_fn
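
To make sure I understand the failure mode, here is a minimal sketch (plain PyTorch, nothing to do with your repo or my trainer) that reproduces the same message: it fires as soon as one element of the tensors tuple handed to torch.autograd.backward has no grad history.

import torch

# Minimal sketch, unrelated to DeepSpeed: torch.autograd.backward raises the same
# error as soon as one element of `tensors` has no grad_fn.
hidden = torch.randn(2, 4, requires_grad=True) * 2.0  # has a grad_fn
mask = torch.ones(2, 4)                               # plain float tensor, no grad_fn

torch.autograd.backward(
    tensors=(hidden, mask),
    grad_tensors=(torch.ones_like(hidden), torch.ones_like(mask)),
)
# RuntimeError: element 1 of tensors does not require grad and does not have a grad_fn

So my current (unconfirmed) guess is that the second floating-point tensor my stage returns, presumably the attention mask, reaches _exec_backward_pass without a gradient history.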

Here is a toy dataset:

from torch.utils.data import Dataset

class TestDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        super().__init__()
        # file_path / tokenizer are unused here; the dataset just repeats one sentence.
        self.data = ["My name is Jiao Fangkai."]

    def __len__(self):
        return 100000000

    def __getitem__(self, index):
        return {"flan": {
            "inputs": self.data[0],
            "targets": self.data[0],
        }}

Here is the collator:

import torch
from transformers import PreTrainedTokenizer

def vanilla_seq2seq_convertor(examples, tokenizer: PreTrainedTokenizer, max_seq_length, decoder_only: bool = False):
    inputs = []
    outputs = []
    for exp in examples:
        inputs.append(exp["inputs"])
        if decoder_only:
            outputs.append(exp["inputs"] + " " + exp["targets"] + tokenizer.eos_token)
        else:
            outputs.append(exp["targets"])

    model_inputs = tokenizer(inputs, text_target=outputs, max_length=max_seq_length, padding="longest",
                             truncation=True, return_tensors="pt")
    if decoder_only:
        # Length of the prompt part: non-pad tokens of the prompt-only encoding.
        input_lens = model_inputs["input_ids"].ne(tokenizer.pad_token_id).sum(dim=1)
        # Re-encode the concatenated prompt + target as the actual model input.
        model_inputs = tokenizer(outputs, max_length=max_seq_length, padding="longest",
                                 truncation=True, return_tensors="pt")
        new_input_lens = model_inputs["input_ids"].ne(tokenizer.pad_token_id).sum(dim=1)
        # If prompt and prompt+target end up the same length (e.g. after truncation),
        # shrink the masked prompt prefix to roughly half its length.
        input_lens = input_lens - input_lens.eq(new_input_lens).to(input_lens.dtype) * (input_lens // 2)
        input_lens = input_lens.to(torch.long)
        model_inputs["input_lens"] = input_lens

    return model_inputs

def get_lm_labels(input_lens, input_ids, pad_token_id):
    # Ignore both the prompt prefix (positions < input_lens) and padding in the LM loss.
    labels = input_ids.clone()

    label_mask = labels.ne(pad_token_id)
    lens_mask = torch.arange(labels.size(1))[None, :] >= input_lens[:, None]
    label_mask = label_mask & lens_mask

    labels = labels.masked_fill(~label_mask, -100).contiguous()

    return labels
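
As a quick sanity check on the masking (with made-up token ids; pad_token_id = 0 here is arbitrary), get_lm_labels behaves as I expect:

import torch

input_ids = torch.tensor([[5, 6, 7, 8, 0, 0]])  # last two positions are padding
input_lens = torch.tensor([2])                  # the first two tokens are the prompt

labels = get_lm_labels(input_lens, input_ids, pad_token_id=0)
print(labels)
# tensor([[-100, -100,    7,    8, -100, -100]])
# -> the prompt prefix and the padding are ignored by the loss; only the target tokens remain.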

import torch
from transformers import AutoTokenizer, PreTrainedTokenizer

class FlanCollatorOverCollator:
    def __init__(self, tokenizer: str, max_seq_length: int, decoder_only: bool = False):
        self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=False)
        expand_special_tokenizer(self.tokenizer)
        self.max_seq_length = max_seq_length
        self.decoder_only = decoder_only

    def __call__(self, batch):
        flan_batch = []
        for item in batch:
            flan_batch.append(item.pop("flan"))

        model_inputs = vanilla_seq2seq_convertor(flan_batch, self.tokenizer, self.max_seq_length, self.decoder_only)

        # Add suffix `input_ids` to tackle the deepspeed logic.
        seq_length = model_inputs["input_ids"].size(1)
        position_ids = torch.arange(0, seq_length, dtype=torch.long)
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

        # DeepSpeed's pipeline engine consumes batches as ((stage inputs...), labels).
        return (
            (
                model_inputs["input_ids"],
                model_inputs["attention_mask"],
                # position_ids,
                # model_inputs["input_lens"],
                # model_inputs["input_ids"].detach().clone()
            ),
            # model_inputs["input_ids"].detach().clone()
            get_lm_labels(model_inputs["input_lens"], model_inputs["input_ids"], self.tokenizer.pad_token_id)
        )

And the initialization:

from deepspeed.pipe import PipelineModule
from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

topo = PipeModelDataParallelTopology(num_pp=4, num_mp=1, num_dp=1)
model = PipelineModule(layers=layers,
                       # num_stages=cfg.num_stages,
                       topology=topo,
                       loss_fn=models.llama_ds_mp_wrap.loss_fn,
                       activation_checkpoint_interval=getattr(cfg, "activation_checkpoint_interval", 0))
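
If it helps to locate the problem: reading deepspeed/runtime/pipe/engine.py, _exec_backward_pass seems to collect every floating-point output of a stage and call backward on all of them, so a float attention mask travelling between stages would land in that list without a grad_fn. Below is only a sketch of the direction I'm experimenting with; EmbeddingPipeSketch and its mask handling are made-up illustrations, not code from this repo or my trainer.

import torch
import torch.nn as nn

class EmbeddingPipeSketch(nn.Module):
    """Hypothetical first pipeline stage, for illustration only."""

    def __init__(self, embed_tokens: nn.Embedding):
        super().__init__()
        self.embed_tokens = embed_tokens

    def forward(self, inputs):
        input_ids, attention_mask = inputs
        hidden_states = self.embed_tokens(input_ids)
        # Pass the mask along as a non-float tensor: the pipeline engine appears to only
        # run backward on floating-point stage outputs, so a bool/int mask is skipped
        # instead of raising "element 1 of tensors does not require grad".
        attention_mask = attention_mask.to(torch.bool)
        # Alternative (hackier) idea if a float mask must travel between stages:
        # attention_mask = attention_mask.to(hidden_states.dtype).requires_grad_(True)
        return hidden_states, attention_mask

The commented-out alternative keeps a float mask but forces requires_grad on it, which should avoid the error at the cost of a useless gradient, though I haven't verified that yet.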