import re

import gradio as gr
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

# Load the tokenizer from the fine-tuned repo: it carries extra Persian tokens,
# so the base model's embedding matrix must be resized to match it below.
tokenizer = LlamaTokenizer.from_pretrained("mostafaamiri/persian_llama_7b")

base_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_8bit=False,
)
base_model.resize_token_embeddings(len(tokenizer))

# Apply the LoRA adapter weights from the Persian fine-tune on top of Llama-2.
model = PeftModel.from_pretrained(
    base_model,
    "mostafaamiri/persian_llama_7b",
)

# Alpaca-style prompt template used during fine-tuning.
prompt_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)


def generate_prompt(instruction, input=None):
    if input:
        instruction = instruction + "\n" + input
    return prompt_input.format_map({"instruction": instruction})


# Decoding settings passed to model.generate().
config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=300,
)


def launch_model(text):
    # Build the prompt from the raw string (not a one-element list, which
    # would render as "['...']" inside the template) and tokenize it.
    input_tokens = tokenizer(generate_prompt(text), return_tensors="pt")
    outputs = model.generate(**input_tokens, **config)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # generate() echoes the prompt; strip it so only the response remains.
    output = re.sub(
        r"Below is an instruction that describes a task\. "
        r"Write a response that appropriately completes the request\.\n\n"
        r"### Instruction:\n\n.*?\n\n### Response:\n\n",
        "",
        output,
        flags=re.DOTALL,
    )
    return output


iface = gr.Interface(fn=launch_model, inputs="text", outputs="text")
iface.launch()
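
# A minimal sanity check, assuming you want to test generation without the
# web UI: comment out iface.launch() above and call launch_model() directly.
# The prompt below is only an illustrative example (Persian for "What is the
# capital of Iran?"); any instruction string works.
#
#   print(launch_model("پایتخت ایران کجاست؟"))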