chatGLM6b: CUDA error when releasing memory!
aofengdaxia opened this issue
Shiyu Zhang commented
I am using a LoRA model that was fine-tuned and then merged. When chatting with it in fp16 format, I frequently hit errors like:
CUDA error = 4, cudaErrorCudartUnloading at /root/autodl-tmp/fastllm-master/src/devices/cuda/fastllm-cuda.cu:1493
'driver shutting down'
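For context, cudaErrorCudartUnloading (error 4) means a CUDA API call (here the free at fastllm-cuda.cu:1493) ran after the CUDA runtime had already begun shutting down. With the Python binding this usually points at GPU buffers being released from a C++ destructor during interpreter teardown, not at a fault in the chat itself. A minimal sketch of a possible workaround, assuming the destructor timing is the cause (the del/gc.collect calls are plain Python, not a fastllm API):

import gc
from fastllm_pytools import llm

model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
try:
    for piece in model.stream_response("hello"):
        print(piece, end="", flush=True)
finally:
    # Drop the last reference while the CUDA runtime is still alive,
    # so the GPU buffers are freed here rather than at interpreter exit.
    del model
    gc.collect()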
Shiyu Zhang commented
The calling code is as follows:
from fastllm_pytools import llm


def glm_prompt3(query: str, history=None, system_prompt='', user_token=64795,
                system_token=64794, assistant_token=64796, observation=64797):
    # Build a ChatGLM2-style prompt; <FLM_FIX_TOKEN_n> is fastllm's placeholder
    # syntax for injecting the special token with id n into the input.
    if history is None:
        history = []
    prompt = f'<FLM_FIX_TOKEN_{system_token}>\n{system_prompt}\n'
    for old_query, response in history:
        prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{old_query}\n"
        prompt += f"<FLM_FIX_TOKEN_{assistant_token}>\n{response}\n"
    prompt += f"<FLM_FIX_TOKEN_{user_token}>\n{query}\n<FLM_FIX_TOKEN_{assistant_token}>"
    return prompt
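For reference, with an empty history a call like glm_prompt3("hi", system_prompt="sys") produces (the token ids are the ChatGLM2 specials from the defaults above):

<FLM_FIX_TOKEN_64794>
sys
<FLM_FIX_TOKEN_64795>
hi
<FLM_FIX_TOKEN_64796>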
def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    model.direct_query = True  # pass the prompt to the model verbatim
    history = []
    system_prompt = """
Answer the following questions as best as you can. You have access to the following tools:
[
    {
        "name": "get_current_weather222",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string"},
            },
            "required": ["location"],
        },
    }
]
"""
    while True:
        query = input("请输入:")
        if query == "exit":
            break
        if query == "clear":
            history = []
            continue
        prompt = glm_prompt3(query, system_prompt=system_prompt, history=history)
        result = ""
        for response in model.stream_response(prompt, temperature=0.8):
            print(response, flush=True, end="")
            result += response
        history.append((query, result))


if __name__ == "__main__":
    main()
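If the teardown timing is indeed the cause, the same workaround can be folded into main() so the model is released before Python exits, including when the loop is left with Ctrl-C (a sketch, not a confirmed fix):

import gc

def main():
    model = llm.model("/root/autodl-tmp/chatglm2-6b-fp16.flm")
    model.direct_query = True
    history = []
    try:
        ...  # the while True chat loop from above, unchanged
    except KeyboardInterrupt:
        pass
    finally:
        # Free GPU memory while the CUDA runtime is still alive.
        del model
        gc.collect()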