ztxz16 / fastllm

A pure C++ cross-platform LLM acceleration library with Python bindings; chatglm-6B-class models can reach 10000+ tokens/s on a single GPU; supports glm, llama, and moss base models; runs smoothly on mobile devices

chatGLM6b: CUDA error when releasing memory!

aofengdaxia opened this issue · comments

I am using a LoRA model that was merged back after fine-tuning. When chatting with it in fp16 format, I frequently hit errors like this:

CUDA error = 4, cudaErrorCudartUnloading at /root/autodl-tmp/fastllm-master/src/devices/cuda/fastllm-cuda.cu:1493
'driver shutting down'

The calling code is as follows:

from fastllm_pytools import llm


def glm_prompt3(query: str, history=None, system_prompt='', user_token=64795, system_token=64794, assistant_token=64796, observation=64797):
    # Build a ChatGLM3-style prompt, using fastllm's <FLM_FIX_TOKEN_*> syntax
    # to inject the special role token ids (system / user / assistant) directly.
    history = history or []
    prompt = f'<FLM_FIX_TOKEN_{system_token}>\n{system_prompt}\n'
    for old_query, response in history:
        prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{old_query}\n"
        prompt += f"<FLM_FIX_TOKEN_{assistant_token}>\n{response}\n"
    prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{query}\n<FLM_FIX_TOKEN_{assistant_token}>"
    return prompt
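
# Example (illustrative): glm_prompt3("hi", history=[("hello", "Hello!")], system_prompt="sys")
# returns the single string
#   "<FLM_FIX_TOKEN_64794>\nsys\n<FLM_FIX_TOKEN_64795>\nhello\n<FLM_FIX_TOKEN_64796>\nHello!\n"
#   "<FLM_FIX_TOKEN_64795>\nhi\n<FLM_FIX_TOKEN_64796>"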

def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    model.direct_query = True  # send the hand-built prompt as-is rather than fastllm's own template

    history = []
    system_prompt = """
    Answer the following questions as best as you can. You have access to the following tools:
[
    {
        "name": "get_current_weather222",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string"},
            },
            "required": ["location"],
        },
    }
]
"""
    while True:
        query = input("Input: ")
        if query == "exit":
            break
        if query == "clear":
            history = []
            continue

        prompt = glm_prompt3(query, system_prompt=system_prompt, history=history)

        result = ""
        for response in model.stream_response(prompt, temperature=0.8):
            print(response, flush=True, end="")
            result += response
        print()

        history.append((query, result))

if __name__ == "__main__":
    main()
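
A note on the error itself: cudaErrorCudartUnloading ('driver shutting down') is raised when cudaFree runs after the CUDA runtime has already begun tearing down, which typically happens when GPU buffers are released from destructors during interpreter exit. Below is a minimal sketch of a possible workaround, assuming the .flm model object frees its GPU memory in its destructor; the explicit del/gc.collect() cleanup is my assumption, not a documented fastllm API.

import gc

from fastllm_pytools import llm

def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    try:
        ...  # the chat loop shown above
    finally:
        # Drop the reference and collect while the CUDA runtime is still alive,
        # instead of leaving the model to be destroyed during interpreter
        # shutdown, which is when cudaErrorCudartUnloading gets raised.
        del model
        gc.collect()

if __name__ == "__main__":
    main()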