'''
CREDIT:
script adapted from [alpaca-lora](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''

import gradio as gr
import torch
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


def apply_delta(base_model_path, target_model_path, delta_path):
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = LlamaForCausalLM.from_pretrained(
        delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    print(f"Loading the base model from {base_model_path}")
    base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False)
    base = LlamaForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # following the Alpaca training recipe, new special tokens are added and initialized
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data

    # zero out the embedding rows of the newly added tokens; their trained values
    # are supplied by the delta weights applied below
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0

    print("Applying the delta")
    target_weights = {}
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta.state_dict()
        param.data += delta.state_dict()[name]
        target_weights[name] = param.data

    print(f"Saving the target model to {target_model_path}")
    base.load_state_dict(target_weights)
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)

    delta = None
    
    return base, delta_tokenizer


base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama' # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)
# cast to float32 for CPU inference
model = model.to(torch.float)

if torch.__version__ >= "2":
    model = torch.compile(model)
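
# Optional sketch (not part of the original script): run the model on GPU in half
# precision instead of the float32 CPU path above; the generate() inputs inside
# respond() would then also need to be moved to model.device.
# if torch.cuda.is_available():
#     model = model.half().to("cuda")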
    
def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # prompt wrapper, only single-turn is allowed for now
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(
        prompt,
        return_tensors="pt", 
        add_special_tokens=False
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # strip the trailing special tokens and keep only the text after the assistant tag
    output = tokenizer.decode(generation_output.sequences[0][:-2])
    response = output.split("### Assistant:\n", 1)[1]
    return response
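
# Quick sanity check (illustrative only, not part of the original app): call respond()
# once before launching the UI to confirm the merged weights produce sensible text.
# print(respond("Briefly explain what a delta checkpoint is.", max_new_tokens=64))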

    
g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=768, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
            label="Output",
        )
    ],
    title="ExpertLLaMA",
    description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5, see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
)
g.queue(concurrency_count=1)
g.launch()
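
# When running locally outside a hosted Space (optional, not in the original script),
# a temporary public link can be requested instead:
# g.launch(share=True)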