from llama_cpp import Llama
from llama_cpp import ChatCompletionRequestMessage as Message
from llama_cpp import ChatCompletionRequestSystemMessage as SystemMessage
from llama_cpp import ChatCompletionRequestAssistantMessage as AssistantMessage
from llama_cpp import ChatCompletionRequestUserMessage as UserMessage

SYSTEM = 'system'
USER = 'user'
ASSISTANT = 'assistant'
EXIT = 'exit'

model_path = "zephyr-7b-beta.Q4_K_S.gguf"
# Set chat_format according to the model you are using
llm = Llama(model_path=model_path, n_ctx=512, chat_format="zephyr")


class Chat:
    def __init__(self, model: Llama) -> None:
        self.model: Llama = model
        # Seed the history with a system prompt and an opening assistant greeting
        self.messages: list[Message] = [
            SystemMessage(
                role=SYSTEM,
                content='You are a helpful developer assistant, answer all the questions correctly and concisely.'
            ),
            AssistantMessage(role=ASSISTANT, content='Hello, do you have any question?'),
        ]

    def send_message(self, content: str) -> None:
        new_message = UserMessage(role=USER, content=content)
        self.messages.append(new_message)

    def generate_reply(self) -> str:
        response = self.model.create_chat_completion(
            messages=self.messages,
            temperature=0.7,
            top_p=0.9,
            top_k=20,
            max_tokens=128
        )
        # The completion returns a message dict; keep only its text content
        reply = response['choices'][0]['message']['content']
        self.messages.append(AssistantMessage(role=ASSISTANT, content=reply))
        return reply
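

# A minimal interactive driver for the Chat class above. This loop is a sketch
# (not part of the original listing) showing one way the EXIT constant could be
# used to end the conversation from the terminal.
if __name__ == '__main__':
    chat = Chat(llm)
    # Print the assistant's opening greeting from the seeded history
    print(chat.messages[-1]['content'])
    while True:
        user_input = input('> ')
        if user_input.strip().lower() == EXIT:
            break
        chat.send_message(user_input)
        print(chat.generate_reply())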