pytorch-labs / gpt-fast

Simple and efficient pytorch-native transformer text generation in <1000 LOC of python.

Bug: convert HF model

vinhtran2611 opened this issue

Bug Report

Description:

I encountered a bug when converting a model from Hugging Face (HF) with the code below. The issue appears to be in how the parameters are mapped into the PyTorch model: after conversion, the logits of the two models do not match.

Code Implementation:

import re
import torch
from transformers import LlamaForCausalLM, AutoTokenizer

from models.model_configs import transformer_configs
from models.llama import ModelArgs, LLama

# Load the HF reference model and tokenizer
model_path = "nickypro/tinyllama-15M-fp32"
model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype="auto", use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Build the local implementation with the matching config
tiny_llama_15M_config = ModelArgs.from_name('tinyllama-15M')
# ModelArgs(block_size=256, vocab_size=32000, n_layer=6, n_head=6, dim=288, intermediate_size=768,
#           n_local_heads=6, head_dim=48, rope_base=10000, norm_eps=1e-05)
mymodel = LLama(tiny_llama_15M_config)

# Convert the HF state dict: map parameter names to the ones used by this implementation
weight_map = {
    "model.embed_tokens.weight": "tok_embeddings.weight",
    "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
    "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
    "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
    "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
    'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
    'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
    "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
    "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
    "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
    "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
    "model.norm.weight": "norm.weight",
    "lm_head.weight": "output.weight",
}

def permute(w, n_head, dim, head_dim):
    # Reorder the q/k projection rows from HF's rotary-embedding layout
    # into the interleaved layout expected by this implementation.
    return (
        w.view(n_head, 2, head_dim // 2, dim)
        .transpose(1, 2)
        .reshape(head_dim * n_head, dim)
    )
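
# Tiny illustration (not part of the conversion): with a single head, head_dim=4 and dim=1,
# permute interleaves the first and second halves of the head, so the rows
# [0, 1, 2, 3] come out as [0, 2, 1, 3].
print(permute(torch.arange(4.).view(4, 1), n_head=1, dim=1, head_dim=4).flatten())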

hf_path = "tests/tiny_15M_fp32.pt"
torch.save(model.state_dict(), hf_path)
checkpoint = torch.load(hf_path)

final_result = {}
for key, value in checkpoint.items():
    if "layers" in key:
        # Replace the layer index with "{}" to look up the mapping, then re-insert it
        abstract_key = re.sub(r'(\d+)', '{}', key)
        layer_num = re.search(r'\d+', key).group(0)
        new_key = weight_map[abstract_key]
        if new_key is None:
            continue
        new_key = new_key.format(layer_num)
    else:
        new_key = weight_map[key]

    final_result[new_key] = value

# Fuse the separate q/k/v projections into a single wqkv weight
for key in tuple(final_result.keys()):
    if "wq" in key:
        q = final_result[key]
        k = final_result[key.replace("wq", "wk")]
        v = final_result[key.replace("wq", "wv")]
        q = permute(q, tiny_llama_15M_config.n_head, tiny_llama_15M_config.dim, tiny_llama_15M_config.head_dim)
        k = permute(k, tiny_llama_15M_config.n_local_heads, tiny_llama_15M_config.dim, tiny_llama_15M_config.head_dim)
        final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
        del final_result[key]
        del final_result[key.replace("wq", "wk")]
        del final_result[key.replace("wq", "wv")]

torch.save(final_result, "tests/model_converted.pth")
mymodel.load_state_dict(torch.load("tests/model_converted.pth"))
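
# Sanity check (not in the original script): the converted state dict should hold the
# same number of parameters as the HF checkpoint, once the rotary inv_freq buffers
# skipped above are excluded. The qkv fusion only concatenates tensors, so the totals
# should still match.
hf_params = sum(v.numel() for k, v in checkpoint.items() if "rotary_emb.inv_freq" not in k)
converted_params = sum(v.numel() for v in final_result.values())
assert hf_params == converted_params, (hf_params, converted_params)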

# Input
inputs = tokenizer("I am a student at", return_tensors="pt", return_attention_mask=False)
inp = inputs['input_ids']
T = inp.shape[1]
input_pos = torch.arange(0, T)

# Run HF
hf_outputs = model(**inputs)

# Run the converted implementation
mymodel.setup_caches(1, tiny_llama_15M_config.block_size)
output = mymodel(inp, input_pos)

# Output
hf_outputs.logits
tensor([[[ -6.7908,   0.8281,  -6.7904,  ...,  -6.7907,  -6.7907,  -6.7905],
         [ -8.2606,  -0.2434,  -8.2608,  ...,  -8.2607,  -8.2609,  -8.2608],
         [-10.8138,  -3.1881, -10.8137,  ..., -10.8138, -10.8139, -10.8138],
         [-11.4940,  -0.7831, -11.4936,  ..., -11.4939, -11.4938, -11.4937],
         [-11.8310,  -2.4853, -11.8308,  ..., -11.8310, -11.8310, -11.8310],
         [ -6.9855,   0.3798,  -6.9853,  ...,  -6.9855,  -6.9853,  -6.9854]]],
       grad_fn=<UnsafeViewBackward0>)

output
tensor([[[ -6.7908,   0.8281,  -6.7904,  ...,  -6.7907,  -6.7907,  -6.7905],
         [ -8.2573,  -0.2481,  -8.2575,  ...,  -8.2574,  -8.2576,  -8.2575],
         [-10.8115,  -3.1968, -10.8114,  ..., -10.8116, -10.8116, -10.8115],
         [-11.4960,  -0.7812, -11.4957,  ..., -11.4959, -11.4959, -11.4957],
         [-11.8348,  -2.4808, -11.8346,  ..., -11.8348, -11.8348, -11.8348],
         [ -6.9774,   0.3842,  -6.9772,  ...,  -6.9774,  -6.9772,  -6.9773]]],
       grad_fn=<UnsafeViewBackward0>)

diff = torch.sum(abs(hf_outputs.logits - output), -1)
tensor([[  0.0000,  99.5280,  80.2527,  58.3247,  99.8532, 236.9683]],
       grad_fn=<SumBackward1>)
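
To narrow down where the divergence starts, one option is to compare the per-layer activations of the two models with forward hooks. This is only a sketch: the attribute names model.model.layers and mymodel.layers are assumptions based on the HF Llama layout and the weight map above.

hf_acts, my_acts = [], []

def save_hidden(store):
    def hook(module, args, output):
        # HF decoder layers may return a tuple; keep only the hidden states
        out = output[0] if isinstance(output, tuple) else output
        store.append(out.detach().float())
    return hook

hf_handles = [layer.register_forward_hook(save_hidden(hf_acts)) for layer in model.model.layers]
my_handles = [layer.register_forward_hook(save_hidden(my_acts)) for layer in mymodel.layers]

with torch.no_grad():
    model(**inputs)
    mymodel.setup_caches(1, tiny_llama_15M_config.block_size)
    mymodel(inp, input_pos)

for i, (a, b) in enumerate(zip(hf_acts, my_acts)):
    print(f"layer {i}: max abs diff = {(a - b).abs().max().item():.6f}")

for handle in hf_handles + my_handles:
    handle.remove()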

I think the bug lies in the Key-Value (KV) cache, since the logits for the first token match exactly while the later tokens diverge.
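
One way to test that hypothesis is to run the converted model token by token, so every step after the first has to read from the cache, and compare against the full forward pass. This is a sketch; it assumes setup_caches can be called again to re-initialize the cache and that the model accepts a single-token input together with its absolute position.

mymodel.setup_caches(1, tiny_llama_15M_config.block_size)
with torch.no_grad():
    step_logits = [mymodel(inp[:, t:t + 1], torch.tensor([t])) for t in range(T)]
incremental = torch.cat(step_logits, dim=1)

mymodel.setup_caches(1, tiny_llama_15M_config.block_size)  # reset the cache
with torch.no_grad():
    full = mymodel(inp, input_pos)

# If incremental decoding matches the full pass but both differ from HF,
# the cache is probably not the culprit.
print((incremental - full).abs().max())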

@Chillee @kit1980

Have you solved this problem? I also found that the tensors differ, and not only at the first token: the logits for all tokens are different. @Chillee Can you help take a look at this problem?

@vinhtran2611
I have set AutoModelForCausalLM.from_pretrained(torch_dtype=torch.bfloat16) and _load_model(precision=torch.bfloat16).

But I get hf_outputs.logits.dtype == torch.float32 and output.dtype == torch.bfloat16. Maybe it is a precision problem.
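
One quick way to rule precision in or out is to put both models in the same dtype before comparing; the tolerance below is arbitrary. A sketch:

model = model.float()
mymodel = mymodel.float()
with torch.no_grad():
    hf_logits = model(**inputs).logits
    mymodel.setup_caches(1, tiny_llama_15M_config.block_size)  # rebuild caches after the cast
    my_logits = mymodel(inp, input_pos)

print((hf_logits - my_logits).abs().max())
print(torch.allclose(hf_logits, my_logits, atol=1e-4))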