llama_7b

Paused

File size: 2,815 Bytes

de2c662
 
 
 
 
 
b8e4ce9
de2c662
 
 
b8e4ce9
 
de2c662
 
 
 
 
 
 
 
b8e4ce9
de2c662
 
b8e4ce9
 
de2c662
b8e4ce9
de2c662
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8e4ce9
de2c662
 
 
 
b8e4ce9
de2c662
b8e4ce9
de2c662
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8e4ce9
 
de2c662
b8e4ce9

import gradio as gr

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel, PeftConfig

# Identifier of the adapter checkpoint. The tokenizer and the PEFT config are
# both loaded from this same identifier; the config supplies the name/path of
# the base model (config.base_model_name_or_path) used below.
MODEL_NAME = "IlyaGusev/llama_7b_ru_turbo_alpaca_lora"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = PeftConfig.from_pretrained(MODEL_NAME)

# Load the base causal LM named by the adapter config, wrap it with the
# adapter weights from MODEL_NAME, and switch the result to evaluation mode.
# The order matters: `model` is first bound to the base model and then
# rebound to the PEFT-wrapped model that the rest of the file uses.
# NOTE(review): load_in_8bit=True presumably requires bitsandbytes and a
# CUDA device, and device_map="auto" presumably requires accelerate — confirm
# against the deployment environment.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto"
)
model = PeftModel.from_pretrained(model, MODEL_NAME)
model.eval()

# Build the text prompt that is handed to the tokenizer
def generate_prompt(instruction, input=None):
    """Return the prompt string for *instruction* and an optional *input*.

    When *input* is falsy (``None``, an empty string, ...) the ``Input:``
    line is omitted and a blank line separates the task from the
    ``Output:`` marker. Otherwise the input is placed on its own line
    between the task and the marker.
    """
    has_context = bool(input)
    if not has_context:
        return f"Task: {instruction}\n\nOutput:"
    return f"Task: {instruction}\nInput: {input}\nOutput:"

# Coerce an integer-valued generation setting without disturbing None
def _as_int(value):
    """Return ``int(value)``, or ``None`` when *value* is ``None``."""
    return None if value is None else int(value)


# Run the model on the user's task/input and return the decoded text
def evaluate(
    instruction,
    input=None,
    temperature=1.0,
    top_p=1.0,
    top_k=40,
    num_beams=3,
    max_new_tokens=256,
    **kwargs,
):
    """Generate text for an instruction and optional input.

    The prompt is built with ``generate_prompt``, tokenized with the
    module-level ``tokenizer``, and passed to ``model.generate`` under
    ``torch.no_grad()``.

    Args:
        instruction: Task text placed in the prompt.
        input: Optional extra context; omitted from the prompt when falsy.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig`` (coerced to ``int``).
        num_beams: Forwarded to ``GenerationConfig`` (coerced to ``int``).
        max_new_tokens: Passed to ``model.generate`` (coerced to ``int``).
        **kwargs: Any further ``GenerationConfig`` fields.

    Returns:
        The first returned sequence, decoded with special tokens skipped
        and surrounding whitespace stripped.

    NOTE(review): ``do_sample`` is not set here, so unless a caller passes it
    via ``**kwargs`` the temperature/top_p/top_k values presumably have no
    effect on decoding — confirm against the transformers version in use.
    NOTE(review): the decoded sequence presumably still contains the prompt
    text for a causal LM — confirm whether the UI should strip it.
    """
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        # UI widgets / API clients may deliver whole numbers as floats
        # (e.g. 4.0); these settings must be integers.
        top_k=_as_int(top_k),
        num_beams=_as_int(num_beams),
        **kwargs,
    )
    with torch.no_grad():
        # Per-step scores are not requested: only the token ids are used
        # below, so keeping them would just hold extra tensors in memory.
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=_as_int(max_new_tokens),
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s, skip_special_tokens=True)
    return output.strip()

# Set up a Gradio interface for the evaluation function.
# The input components are listed in the same order as evaluate()'s
# positional parameters: instruction, input, temperature, top_p, top_k,
# num_beams, max_new_tokens.
g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Task", placeholder="Why is grass green?"
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="None"),
        gr.components.Slider(minimum=0, maximum=2, value=1.0, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=5, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=256, step=1, value=256, label="Max tokens"
        ),
    ],
    outputs=[
        # Use the same gr.components namespace as the inputs; gr.inputs is a
        # deprecated input-only alias and does not belong in `outputs`.
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="LLaMA 7B Ru Turbo Alpaca",
    description="",
)

# Enable Gradio's request queue before launching. concurrency_count=1 is
# passed so that, per the queue API, a single worker processes requests
# (presumably to keep only one generation on the model at a time — confirm).
# launch() then starts serving the interface.
g.queue(concurrency_count=1)
g.launch()