nomic-ai / gpt4all

gpt4all: run open-source LLMs anywhere

Home Page: https://gpt4all.io

[Feature] Ability to populate previous chat history when using chat_session()

simonw opened this issue

Feature Request

I'd like to be able to start a new chat_session() but then populate its history from my own recorded logs, rather than having to use that context manager for an entire chat. Basically I want to do this:

with model.chat_session():
    model._history.append({"role": "user", "content": "2 names for a pet pelican"})
    model._history.append({"role": "assistant", "content": "Charlie, and Polly"})
    print(model.generate("3 more"))

This looks like it should work - I added some debug code and the output_collector used inside the model looks like this just before the prompt is executed:

[
  {'role': 'system', 'content': ''},
  {'role': 'user', 'content': '2 names for a pet pelican'},
  {'role': 'assistant', 'content': 'Charlie, and Polly'},
  {'role': 'user', 'content': '3 more'},
  {'role': 'assistant', 'content': ''}
]

But the model says things like:

It seems like your request is incomplete. Could you please provide additional information or clarify what "3 more" refers to? If it's related to a specific task, quantity, sequence, or something else numerical in nature, I would be happy to assist further!

So clearly the trick of adding things to _history directly like that doesn't work!

I'd love it if there was an official, documented way to do this. I need it for my https://github.com/simonw/llm-gpt4all/ project.

Relevant code:

if self._history is not None:
    # check if there is only one message, i.e. system prompt:
    reset = len(self._history) == 1
    self._history.append({"role": "user", "content": prompt})

    fct_func = self._format_chat_prompt_template.__func__  # type: ignore[attr-defined]
    if fct_func is GPT4All._format_chat_prompt_template:
        if reset:
            # ingest system prompt
            # use "%1%2" and not "%1" to avoid implicit whitespace
            self.model.prompt_model(self._history[0]["content"], "%1%2",
                                    empty_response_callback,
                                    n_batch=n_batch, n_predict=0, reset_context=True, special=True)
        prompt_template = self._current_prompt_template.format("%1", "%2")
    else:
        warnings.warn(
            "_format_chat_prompt_template is deprecated. Please use a chat session with a prompt template.",
            DeprecationWarning,
        )
        # special tokens won't be processed
        prompt = self._format_chat_prompt_template(
            self._history[-1:],
            self._history[0]["content"] if reset else "",
        )
        prompt_template = "%1"
    generate_kwargs["reset_context"] = reset
else:
    prompt_template = "%1"
    generate_kwargs["reset_context"] = True

# Prepare the callback, process the model response
output_collector: list[MessageType]
output_collector = [
    {"content": ""}
]  # placeholder for the self._history if chat session is not activated

if self._history is not None:
    self._history.append({"role": "assistant", "content": ""})
    output_collector = self._history

def _callback_wrapper(
    callback: ResponseCallbackType,
    output_collector: list[MessageType],
) -> ResponseCallbackType:
    def _callback(token_id: int, response: str) -> bool:
        nonlocal callback, output_collector
        output_collector[-1]["content"] += response
        return callback(token_id, response)

    return _callback

# Send the request to the model
if streaming:
    return self.model.prompt_model_streaming(
        prompt,
        prompt_template,
        _callback_wrapper(callback, output_collector),
        **generate_kwargs,
    )

self.model.prompt_model(
    prompt,
    prompt_template,
    _callback_wrapper(callback, output_collector),
    **generate_kwargs,
)

return output_collector[-1]["content"]
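
If I'm reading that right, the manual-append trick fails at the very first line: reset is only True when the history contains nothing but the system prompt, and only the newest user message is ever passed to prompt_model. A quick illustration of the check, using the values from my debug output above:

# After my manual appends the history already holds 3 messages when
# generate("3 more") runs, so reset is False, reset_context is False, and the
# earlier turns are never actually fed to the backend:
history = [
    {"role": "system", "content": ""},
    {"role": "user", "content": "2 names for a pet pelican"},
    {"role": "assistant", "content": "Charlie, and Polly"},
]
reset = len(history) == 1  # False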

My best guess is that self.model.prompt_model_streaming has its own internal state, which is why my attempts to manipulate the state in the outer layer are having no effect.

Maybe the previous tokens are accumulated in this low-level tokens C array, and that's the thing that isn't updated if you add stuff to _history?

class LLModelPromptContext(ctypes.Structure):
    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("logits_size", ctypes.c_size_t),
        ("tokens", ctypes.POINTER(ctypes.c_int32)),
        ("tokens_size", ctypes.c_size_t),
        ("n_past", ctypes.c_int32),
        ("n_ctx", ctypes.c_int32),
        ("n_predict", ctypes.c_int32),
        ("top_k", ctypes.c_int32),
        ("top_p", ctypes.c_float),
        ("min_p", ctypes.c_float),
        ("temp", ctypes.c_float),
        ("n_batch", ctypes.c_int32),
        ("repeat_penalty", ctypes.c_float),
        ("repeat_last_n", ctypes.c_int32),
        ("context_erase", ctypes.c_float),
    ]

I asked for something similar today in #2358.
I tried calling clear() on current_chat_session to start a new chat without leaving the context manager, but that is also ignored.

My simple GUI: https://github.com/woheller69/gpt4all-TK-CHAT

Aha: I spotted this, which happens only if self._history is None:

generate_kwargs["reset_context"] = True

That must be the mechanism that resets the internal token state.

More details on why I need this here:

My LLM tool works by logging messages and responses to a SQLite database, so you can do things like this:

llm "three names for a pet pelican"
# Outputs three names
llm -c "2 more" # -c means continue previous thread
# Outputs two more names

In order to get GPT4All working correctly as a plugin for my tool, I need the ability to instantiate a new model and then start a chat session with the previous context populated from the conversation persisted in my SQLite database - but I can't figure out a way to do that.
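
For illustration, something like this entirely hypothetical history= argument would cover my use case - nothing like it exists in the bindings today:

from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")  # filename is just an example

# Turns previously persisted in my SQLite log
previous_turns = [
    {"role": "user", "content": "three names for a pet pelican"},
    {"role": "assistant", "content": "Charlie, Polly and Percy"},
]

# Hypothetical API: chat_session() has no history= parameter today
with model.chat_session(history=previous_turns):
    print(model.generate("2 more"))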

You might use llama-cpp-agent (https://github.com/Maximilian-Winter/llama-cpp-agent) and llama-cpp-python instead of gpt4all.
I am also experimenting with that combination: https://github.com/woheller69/LLAMA_TK_CHAT/blob/main/LLAMA_TK_GUI.py

There you can do things like: self.llama_cpp_agent.chat_history.get_message_store().add_assistant_message(...)

The way we accomplished support for initial chat-session messages in the Node bindings is by using fake_reply, but I don't think it's exposed or documented as a user-facing parameter in the Python bindings. That looks intentional, though I don't know the exact reasoning. We may want to expose it, or add some other way to provide this "conversation restore" functionality that encapsulates fake_reply. I believe it was initially added to support similar functionality in gpt4all-chat.
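
A rough sketch of what a fake_reply-based restore could look like from Python, assuming the low-level LLModel.prompt_model accepts and forwards a fake_reply argument the way llmodel_prompt does in the C API (that keyword is the unverified part of this sketch):

from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")  # example model file

def ignore_response(token_id: int, response: str) -> bool:
    return True  # discard output while replaying old turns

past_turns = [
    ("2 names for a pet pelican", "Charlie, and Polly"),
]

with model.chat_session():
    # same template expansion that generate() performs internally
    template = model._current_prompt_template.format("%1", "%2")
    for user_msg, assistant_msg in past_turns:
        # ASSUMPTION: fake_reply makes the backend ingest assistant_msg as if the
        # model had generated it, instead of sampling new tokens
        model.model.prompt_model(user_msg, template, ignore_response,
                                 fake_reply=assistant_msg)
        # keep the Python-side history consistent with what the model has seen
        model._history.append({"role": "user", "content": user_msg})
        model._history.append({"role": "assistant", "content": assistant_msg})

    print(model.generate("3 more"))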

There might also be a way to hack around it using the prompt_template parameter plus special=True: send in each whole turn "pre-templated", including the assistant response, with n_predict=0.
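
Concretely, that hack might look something like the following. It only uses arguments already visible in the generate() source quoted above (prompt_template, special, n_predict), but the ChatML-style template string is purely illustrative and I haven't verified that this leaves the KV cache in a consistent state:

from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")  # example model file

def ignore_response(token_id: int, response: str) -> bool:
    return True  # nothing should be generated while replaying

past_turns = [
    ("2 names for a pet pelican", "Charlie, and Polly"),
]

with model.chat_session():
    for user_msg, assistant_msg in past_turns:
        # Render the whole turn with the chat template by hand (ChatML shown as an
        # example; substitute the template your model actually uses).
        turn = (f"<|im_start|>user\n{user_msg}<|im_end|>\n"
                f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n")
        # "%1" passes the text through untouched, special=True lets control tokens be
        # tokenized as such, and n_predict=0 suppresses any new generation.
        model.model.prompt_model(turn, "%1", ignore_response,
                                 n_predict=0, special=True)
        model._history.append({"role": "user", "content": user_msg})
        model._history.append({"role": "assistant", "content": assistant_msg})

    print(model.generate("3 more"))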