ztxz16 / fastllm

A pure C++ cross-platform LLM acceleration library with Python bindings; chatglm-6B-class models can reach 10000+ tokens/s on a single GPU; supports glm, llama, and moss base models; runs smoothly on mobile devices

chatGLM6b: CUDA error when releasing memory!

aofengdaxia opened this issue · comments

I am using a LoRA model that was merged back after fine-tuning. When chatting with it in fp16 format, I frequently hit errors like this:

CUDA error = 4, cudaErrorCudartUnloading at /root/autodl-tmp/fastllm-master/src/devices/cuda/fastllm-cuda.cu:1493
'driver shutting down'

The calling code is as follows:

from fastllm_pytools import llm


def glm_prompt3(query: str, history=None, system_prompt='', user_token=64795, system_token=64794, assistant_token=64796, observation=64797):
    # Build a ChatGLM3-style prompt, using fastllm's <FLM_FIX_TOKEN_*> syntax
    # to inject the special role token ids (system / user / assistant) directly.
    history = history or []
    prompt = f'<FLM_FIX_TOKEN_{system_token}>\n{system_prompt}\n'
    for old_query, response in history:
        prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{old_query}\n"
        prompt += f"<FLM_FIX_TOKEN_{assistant_token}>\n{response}\n"
    prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{query}\n<FLM_FIX_TOKEN_{assistant_token}>"
    return prompt
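
# Example (illustrative): glm_prompt3("hi", history=[("hello", "Hello!")], system_prompt="sys")
# returns the single string
#   "<FLM_FIX_TOKEN_64794>\nsys\n<FLM_FIX_TOKEN_64795>\nhello\n<FLM_FIX_TOKEN_64796>\nHello!\n"
#   "<FLM_FIX_TOKEN_64795>\nhi\n<FLM_FIX_TOKEN_64796>"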

def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    model.direct_query = True  # send the hand-built prompt as-is rather than fastllm's own template

    history = []
    system_prompt = """
    Answer the following questions as best as you can. You have access to the following tools:
[
    {
        "name": "get_current_weather222",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string"},
            },
            "required": ["location"],
        },
    }
]
"""
    while True:
        query = input("Input: ")
        if query == "exit":
            break
        if query == "clear":
            history = []
            continue

        prompt = glm_prompt3(query, system_prompt=system_prompt, history=history)

        result = ""
        for response in model.stream_response(prompt, temperature=0.8):
            print(response, flush=True, end="")
            result += response
        print()

        history.append((query, result))

if __name__ == "__main__":
    main()
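
A note on the error itself: cudaErrorCudartUnloading ('driver shutting down') is raised when cudaFree runs after the CUDA runtime has already begun tearing down, which typically happens when GPU buffers are released from destructors during interpreter exit. Below is a minimal sketch of a possible workaround, assuming the .flm model object frees its GPU memory in its destructor; the explicit del/gc.collect() cleanup is my assumption, not a documented fastllm API.

import gc

from fastllm_pytools import llm

def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    try:
        ...  # the chat loop shown above
    finally:
        # Drop the reference and collect while the CUDA runtime is still alive,
        # instead of leaving the model to be destroyed during interpreter
        # shutdown, which is when cudaErrorCudartUnloading gets raised.
        del model
        gc.collect()

if __name__ == "__main__":
    main()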