RuntimeError: UR error with LlamaIndex
idkSeth opened this issue
Describe the bug
When trying to use IPEX-LLM with LlamaIndex by following their example, RuntimeError: UR error is raised.
Traceback (most recent call last):
File "/media/seth/Second/llama_index/ipex_test.py", line 52, in <module>
resp = llm.chat(messages)
^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py", line 322, in wrapper
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/llms/callbacks.py", line 173, in wrapped_llm_chat
f_return_val = f(_self, messages, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/llms/ipex_llm/base.py", line 472, in chat
completion_response = self.complete(prompt, formatted=True, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py", line 322, in wrapper
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/llms/callbacks.py", line 431, in wrapped_llm_predict
f_return_val = f(_self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/llms/ipex_llm/base.py", line 506, in complete
tokens = self._model.generate(
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/ipex_llm/transformers/pipeline_parallel.py", line 283, in generate
return original_generate(self,
^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/transformers/generation/utils.py", line 1359, in generate
and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) > 0
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: UR error
How to reproduce
Follow the LlamaIndex example by importing IpexLLM and loading a model.
# Transform a string into zephyr-specific input
def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"

# Transform a list of chat messages into zephyr-specific input
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt

from llama_index.llms.ipex_llm import IpexLLM

llm = IpexLLM.from_model_id(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    context_window=512,
    max_new_tokens=128,
    generate_kwargs={"do_sample": False},
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    device_map="xpu",
)
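The chat call that actually raises the error (line 52 of ipex_test.py in the traceback) follows the LlamaIndex example. A minimal sketch of that part, with an assumed example message, is:

from llama_index.core.llms import ChatMessage

# Build a simple chat history and send it to the model; this llm.chat()
# call is what ends in RuntimeError: UR error in the traceback above.
messages = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="What is AI?"),
]
resp = llm.chat(messages)
print(resp)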
Environment information
The output of env-check.sh is attached. I have an iGPU that is not detected by the script but works (unrelated to this issue).
Additional context
It seems this issue stems from IPEX/PyTorch itself: performing various tensor operations on XPU results in RuntimeError: UR error. The line that triggers the error can be worked around by moving the tensor to CPU before the sum, but the same error is then raised elsewhere:
Traceback (most recent call last):
File "/media/seth/Second/llama_index/ipex_test.py", line 52, in <module>
resp = llm.chat(messages)
^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py", line 322, in wrapper
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/llms/callbacks.py", line 173, in wrapped_llm_chat
f_return_val = f(_self, messages, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/llms/ipex_llm/base.py", line 472, in chat
completion_response = self.complete(prompt, formatted=True, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py", line 322, in wrapper
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/core/llms/callbacks.py", line 431, in wrapped_llm_predict
f_return_val = f(_self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/llama_index/llms/ipex_llm/base.py", line 506, in complete
tokens = self._model.generate(
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/ipex_llm/transformers/pipeline_parallel.py", line 283, in generate
return original_generate(self,
^^^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/transformers/generation/utils.py", line 1474, in generate
return self.greedy_search(
^^^^^^^^^^^^^^^^^^^
File "/media/seth/Second/conda_llama-index-1/lib/python3.11/site-packages/transformers/generation/utils.py", line 2388, in greedy_search
next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
RuntimeError: UR error
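The failure appears reproducible without LlamaIndex at all. Below is a minimal sketch, assuming intel_extension_for_pytorch is installed and an XPU device is visible (tensor values are illustrative only), of the kind of operation that fails and of the move-to-CPU workaround mentioned above:

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401 -- registers the "xpu" device

# The failing lines in both tracebacks boil down to elementwise
# comparisons and reductions on XPU tensors.
x = torch.tensor([1, 2, 3, 2], device="xpu")
pad_token_id = 2

# Roughly equivalent to the pad-token check in transformers' generate();
# on the affected setup this raises RuntimeError: UR error.
try:
    n_pad = torch.sum(x == pad_token_id)
    print("sum on xpu ok:", n_pad.item())
except RuntimeError as e:
    print("sum on xpu failed:", e)

# Workaround described above: move the tensor to CPU before the reduction.
n_pad_cpu = torch.sum(x.cpu() == pad_token_id)
print("sum on cpu:", n_pad_cpu.item())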