import time

import gradio as gr

from llm import Gemma2B

llm = Gemma2B()


def echo(message, history, system_prompt, tokens):
    # Rebuild the conversation in Gemma's chat format ("user"/"model" roles).
    # Note: appending per turn here; assigning inside the loop would keep
    # only the last exchange.
    chat_template = []
    for user, model in history:
        chat_template.append({"role": "user", "content": user})
        chat_template.append({"role": "model", "content": model})

    # Gemma's chat template has no system role, so fold the system prompt
    # into the latest user message instead.
    if system_prompt:
        message = f"{system_prompt}\n\n{message}"

    response = llm.inference_quantized_4bit(
        chat_template + [{"role": "user", "content": message}]
    )
    # The raw output is the full transcript including Gemma's turn markers;
    # keep only the model's final turn.
    response = (
        response.split("<start_of_turn>")[-1]
        .removeprefix("model")
        .split("<end_of_turn>")[0]
        .strip()
    )

    # Stream the reply character by character, capped by the slider value.
    for i in range(min(len(response), int(tokens))):
        time.sleep(0.05)
        yield response[: i + 1]


demo = gr.ChatInterface(
    echo,
    additional_inputs=[
        gr.Textbox("You are a helpful AI.", label="System Prompt"),
        gr.Slider(10, 200, 100, label="Tokens"),
    ],
)

demo.queue().launch(debug=True)
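
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what the `llm` module imported above
# could look like, saved as its own file (llm.py). This is an assumption,
# not the original implementation: it loads an instruction-tuned Gemma 2B
# checkpoint in 4-bit via bitsandbytes and Hugging Face transformers, and
# the model id and max_new_tokens value are illustrative placeholders.
# ---------------------------------------------------------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class Gemma2B:
    def __init__(self, model_id="google/gemma-2b-it"):  # assumed checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # 4-bit loading requires the bitsandbytes package and a CUDA device.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )

    def inference_quantized_4bit(self, chat):
        # Render the "user"/"model" turns with Gemma's chat template and
        # append the generation prompt ("<start_of_turn>model").
        prompt = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**inputs, max_new_tokens=256)
        # Decode the full transcript; the caller extracts the last model turn.
        return self.tokenizer.decode(output_ids[0])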