'''
CREDIT:
script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''
import gradio as gr
import torch
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

def apply_delta(base_model_path, target_model_path, delta_path):
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = LlamaForCausalLM.from_pretrained(
        delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    print(f"Loading the base model from {base_model_path}")
    base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False)
    base = LlamaForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # Following the alpaca training recipe, the delta checkpoint was trained with
    # newly initialized special tokens; add the same tokens to the base model so
    # both vocabularies (and embedding shapes) match before the weights are summed.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))
    # Zero the embedding rows of the newly added tokens so that adding the delta
    # leaves exactly the delta's trained embeddings in those rows.
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0

    print("Applying the delta")
    target_weights = {}
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta.state_dict()
        param.data += delta.state_dict()[name]
        target_weights[name] = param.data

    print(f"Saving the target model to {target_model_path}")
    base.load_state_dict(target_weights)
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)

    # Release the delta model; the merged weights now live in `base`.
    delta = None
    return base, delta_tokenizer
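
# A minimal sketch (not part of the original script) of why the new token rows
# are zeroed above: the merge is an element-wise sum, so a zeroed base row ends
# up holding the delta's trained row verbatim. Toy tensors only.
_base_row = torch.zeros(4, dtype=torch.float16)
_delta_row = torch.tensor([0.1, -0.2, 0.3, -0.4], dtype=torch.float16)
assert torch.equal(_base_row + _delta_row, _delta_row)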

base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama'  # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)

# Cast the merged fp16 weights back to float32 for inference.
model = model.to(torch.float)
if torch.__version__ >= "2":
    model = torch.compile(model)
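# torch.compile is available from PyTorch 2.0 onward; the version-string
# comparison above is inherited from the upstream alpaca-lora script.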

def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # Prompt wrapper; only single-turn exchanges are supported for now.
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Strip the trailing special tokens and return only the assistant's reply.
    response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
    return response
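
# Illustrative call (hypothetical, not part of the original script; kept
# commented out so the Space only serves the web UI):
# print(respond("Explain what a delta checkpoint is.", max_new_tokens=64))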

g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=768, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
            label="Output",
        )
    ],
    title="ExpertLLaMA",
    description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
)
g.queue(concurrency_count=1)
g.launch()