'''
CREDIT:
script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''
import gradio as gr
import random
import time
import transformers
import os
import json
import torch
import argparse
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
def apply_delta(base_model_path, target_model_path, delta_path):
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = LlamaForCausalLM.from_pretrained(
        delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    print(f"Loading the base model from {base_model_path}")
    base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False)
    base = LlamaForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # Following the Alpaca training recipe, newly initialized special tokens were added
    # during fine-tuning, so the base model must be extended the same way before the
    # delta can be applied.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))

    # Zero out the embedding rows for the new tokens; their learned values are carried
    # entirely by the delta weights added below.
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0

    print("Applying the delta")
    delta_state_dict = delta.state_dict()
    target_weights = {}
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta_state_dict
        param.data += delta_state_dict[name]
        target_weights[name] = param.data

    print(f"Saving the target model to {target_model_path}")
    base.load_state_dict(target_weights)
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)

    # Free the delta model; only the reconstructed base and its tokenizer are needed.
    delta = None
    return base, delta_tokenizer
base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama' # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)
# cast to float32 for inference
model = model.to(torch.float)
if torch.__version__ >= "2":
    model = torch.compile(model)
def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # Prompt wrapper; only single-turn conversations are supported for now.
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Strip the final two tokens and return only the text after the assistant header.
    response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
    return response
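
# A minimal sketch of calling `respond` directly, e.g. for a quick local smoke test
# outside the Gradio UI. The instruction string and sampling values below are
# illustrative assumptions, not settings from the original Space; uncomment to run.
#
# print(respond("Explain what a delta checkpoint is.", temperature=0.1, max_new_tokens=64))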
g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(lines=2, label="Instruction"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(minimum=1, maximum=768, step=1, value=512, label="Max tokens"),
    ],
    outputs=[
        gr.components.Textbox(lines=8, label="Output"),
    ],
    title="ExpertLLaMA",
    description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
)
g.queue(concurrency_count=1)
g.launch()