import time

import gradio as gr

from llm import Gemma2B

llm = Gemma2B()


def echo(message, history, system_prompt, tokens):
    # Rebuild the conversation in Gemma's chat format ("user"/"model" roles).
    # Note: appending per turn here; assigning inside the loop would keep
    # only the last exchange.
    chat_template = []
    for user, model in history:
        chat_template.append({"role": "user", "content": user})
        chat_template.append({"role": "model", "content": model})

    # Gemma's chat template has no system role, so fold the system prompt
    # into the latest user message instead.
    if system_prompt:
        message = f"{system_prompt}\n\n{message}"

    response = llm.inference_quantized_4bit(
        chat_template + [{"role": "user", "content": message}]
    )
    # The raw output is the full transcript including Gemma's turn markers;
    # keep only the model's final turn.
    response = (
        response.split("<start_of_turn>")[-1]
        .removeprefix("model")
        .split("<end_of_turn>")[0]
        .strip()
    )

    # Stream the reply character by character, capped by the slider value.
    for i in range(min(len(response), int(tokens))):
        time.sleep(0.05)
        yield response[: i + 1]


demo = gr.ChatInterface(
    echo,
    additional_inputs=[
        gr.Textbox("You are a helpful AI.", label="System Prompt"),
        gr.Slider(10, 200, 100, label="Tokens"),
    ],
)

demo.queue().launch(debug=True)
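
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what the `llm` module imported above
# could look like, saved as its own file (llm.py). This is an assumption,
# not the original implementation: it loads an instruction-tuned Gemma 2B
# checkpoint in 4-bit via bitsandbytes and Hugging Face transformers, and
# the model id and max_new_tokens value are illustrative placeholders.
# ---------------------------------------------------------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class Gemma2B:
    def __init__(self, model_id="google/gemma-2b-it"):  # assumed checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # 4-bit loading requires the bitsandbytes package and a CUDA device.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )

    def inference_quantized_4bit(self, chat):
        # Render the "user"/"model" turns with Gemma's chat template and
        # append the generation prompt ("<start_of_turn>model").
        prompt = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**inputs, max_new_tokens=256)
        # Decode the full transcript; the caller extracts the last model turn.
        return self.tokenizer.decode(output_ids[0])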