import re

import gradio as gr
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

# Load the tokenizer from the fine-tuned repo: it carries extra Persian tokens,
# so the base model's embedding matrix must be resized to match it below.
tokenizer = LlamaTokenizer.from_pretrained("mostafaamiri/persian_llama_7b")

base_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_8bit=False,
)
base_model.resize_token_embeddings(len(tokenizer))

# Apply the LoRA adapter weights from the Persian fine-tune on top of Llama-2.
model = PeftModel.from_pretrained(
    base_model,
    "mostafaamiri/persian_llama_7b",
)

# Alpaca-style prompt template used during fine-tuning.
prompt_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)


def generate_prompt(instruction, input=None):
    if input:
        instruction = instruction + "\n" + input
    return prompt_input.format_map({"instruction": instruction})


# Decoding settings passed to model.generate().
config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=300,
)


def launch_model(text):
    # Build the prompt from the raw string (not a one-element list, which
    # would render as "['...']" inside the template) and tokenize it.
    input_tokens = tokenizer(generate_prompt(text), return_tensors="pt")
    outputs = model.generate(**input_tokens, **config)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # generate() echoes the prompt; strip it so only the response remains.
    output = re.sub(
        r"Below is an instruction that describes a task\. "
        r"Write a response that appropriately completes the request\.\n\n"
        r"### Instruction:\n\n.*?\n\n### Response:\n\n",
        "",
        output,
        flags=re.DOTALL,
    )
    return output


iface = gr.Interface(fn=launch_model, inputs="text", outputs="text")
iface.launch()
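
# A minimal sanity check, assuming you want to test generation without the
# web UI: comment out iface.launch() above and call launch_model() directly.
# The prompt below is only an illustrative example (Persian for "What is the
# capital of Iran?"); any instruction string works.
#
#   print(launch_model("پایتخت ایران کجاست؟"))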