'''
CREDIT:
script adapted from [alpaca-lora](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''
import gradio as gr
import torch
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


def apply_delta(base_model_path, target_model_path, delta_path):
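    """
    Merge the released delta weights into the base LLaMA weights and return
    the reconstructed model together with the delta tokenizer.
    target_model_path is only used by the (disabled) save step at the end
    of this function.
    """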
print(f"Loading the delta weights from {delta_path}") | |
delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False) | |
delta = LlamaForCausalLM.from_pretrained( | |
delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16 | |
) | |
print(f"Loading the base model from {base_model_path}") | |
base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False) | |
base = LlamaForCausalLM.from_pretrained( | |
base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16 | |
) | |
    # Following the Alpaca training recipe, new special tokens were added
    # during fine-tuning, so the base vocabulary must be extended to match.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))

    # Zero-initialize the embedding rows for the new tokens so that adding
    # the delta reproduces the fine-tuned embeddings exactly. Guard against
    # num_new_tokens == 0, where `[-0:]` would zero the entire matrix.
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    if num_new_tokens > 0:
        input_embeddings[-num_new_tokens:] = 0
        output_embeddings[-num_new_tokens:] = 0
print("Applying the delta") | |
target_weights = {} | |
for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): | |
assert name in delta.state_dict() | |
param.data += delta.state_dict()[name] | |
target_weights[name] = param.data | |
print(f"Saving the target model to {target_model_path}") | |
base.load_state_dict(target_weights) | |
# base.save_pretrained(target_model_path) | |
# delta_tokenizer.save_pretrained(target_model_path) | |
delta = None | |
return base, delta_tokenizer | |
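

# ExpertLLaMA is distributed as a delta over the base LLaMA-7B weights rather
# than as a full checkpoint, so the merged model is reconstructed at startup.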
base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama'  # local path (only used if saving is re-enabled)
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)

# The merge runs in float16; convert to float32 for inference, since the model
# is never moved to a GPU in this script. Compile under PyTorch 2.x for speed.
model = model.to(torch.float)
if torch.__version__ >= "2":
    model = torch.compile(model)


def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # Prompt wrapper; only single-turn exchanges are supported for now.
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Drop the last two generated tokens (trailing sequence markers), then
    # keep only the text after the assistant tag.
    response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
    return response
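
# Direct invocation, for illustration (output varies with sampling settings):
#   print(respond("Introduce yourself in one sentence."))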

g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=768, step=1, value=512, label="Max new tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
            label="Output",
        )
    ],
    title="ExpertLLaMA",
    description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
)
g.queue(concurrency_count=1)
g.launch()
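
# Note: this script assumes a Gradio 3.x environment (queue(concurrency_count=...)
# was removed in Gradio 4); run locally with `python app.py` and open the URL
# that Gradio prints.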