import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
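# NOTE: llama_cpp_cuda_tensorcores is assumed here to be a CUDA (tensor-core)
# build of llama-cpp-python; it exposes the same `Llama` class, so the stock
# `llama_cpp` package could be swapped in as a CPU fallback.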
from llama_cpp_cuda_tensorcores import Llama
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
SYSTEM_PROMPT = "You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability."
TOKEN_STOP = ["<|eot_id|>"]
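# Building blocks of the Llama 3 Instruct chat template. The literal substrings
# "SYSTEM_PROMPT" and "USER_PROMPT" below are placeholders that
# apply_chat_template() replaces with the actual system message and user turns.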
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
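# System prompts selectable from the "Contextual Prompt Editor" dropdown:
# keys are the dropdown labels, values are the corresponding system messages.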
TASK_PROMPT = {
"Assistant": SYSTEM_PROMPT,
"Translate": "You are an expert translator. Translate the following text into English.",
"Summarization": "Summarizing information is my specialty. Let me know what you'd like summarized.",
"Grammar correction": "Grammar is my forte! Feel free to share the text you'd like me to proofread and correct.",
"Stable diffusion prompt generator": "You are a stable diffusion prompt generator. Break down the user's text and create a more elaborate prompt.",
"Play Trivia": "Engage the user in a trivia game on various topics.",
"Share Fun Facts": "Share interesting and fun facts on various topics.",
"Explain code": "You are an expert programmer guiding someone through a piece of code step by step, explaining each line and its function in detail.",
"Paraphrase Master": "You have the knack for transforming complex or verbose text into simpler, clearer language while retaining the original meaning and essence.",
"Recommend Movies": "Recommend movies based on the user's preferences.",
"Offer Motivational Quotes": "Offer motivational quotes to inspire the user.",
"Recommend Books": "Recommend books based on the user's favorite genres or interests.",
"Philosophical discussion": "Engage the user in a philosophical discussion",
"Music recommendation": "Tune time! What kind of music are you in the mood for? I'll find the perfect song for you.",
"Generate a Joke": "Generate a witty joke suitable for a stand-up comedy routine.",
"Roleplay as a Detective": "Roleplay as a detective interrogating a suspect in a murder case.",
"Act as a News Reporter": "Act as a news reporter covering breaking news about an alien invasion.",
"Play as a Space Explorer": "Play as a space explorer encountering a new alien civilization.",
"Be a Medieval Knight": "Imagine yourself as a medieval knight embarking on a quest to rescue a princess.",
"Act as a Superhero": "Act as a superhero saving a city from a supervillain's evil plot.",
"Play as a Pirate Captain": "Play as a pirate captain searching for buried treasure on a remote island.",
"Be a Famous Celebrity": "Imagine yourself as a famous celebrity attending a glamorous red-carpet event.",
"Design a New Invention": "Imagine you're an inventor tasked with designing a revolutionary new invention that will change the world.",
"Act as a Time Traveler": "You've just discovered time travel! Describe your adventures as you journey through different eras.",
"Play as a Magical Girl": "You are a magical girl with extraordinary powers, battling dark forces to protect your city and friends.",
"Act as a Shonen Protagonist": "You are a determined and spirited shonen protagonist on a quest for strength, friendship, and victory.",
"Roleplay as a Tsundere Character": "You are a tsundere character, initially cold and aloof but gradually warming up to others through unexpected acts of kindness.",
}
css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"
class ChatLLM:
    def __init__(self, config_model):
        self.llm = None
        self.config_model = config_model
        # self.load_cpp_model()

    def load_cpp_model(self):
        self.llm = Llama(**self.config_model)

    def apply_chat_template(
        self,
        history,
        system_message,
    ):
        history = history or []
        messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
        for msg in history:
            messages += (
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
            )
            messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""
        print(messages)
        # messages = messages[:-1]
        return messages
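    # response() streams tokens back to the Chatbot as they are generated.
    # The @spaces.GPU decorator requests a ZeroGPU allocation for each call
    # (here capped at 120 seconds).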
    @spaces.GPU(duration=120)
    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        messages = self.apply_chat_template(history, system_message)
        history[-1][1] = ""
        if not self.llm:
            print("Loading model")
            self.load_cpp_model()
        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            answer = output["choices"][0]["text"]
            history[-1][1] += answer
            # stream the response
            yield history, history
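# Gradio callbacks: user() appends the new message to the history with an
# empty assistant slot; clear_chat() resets the stored history and the textbox.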
def user(message, history):
    history = history or []
    # Append the user's message to the conversation history
    history.append([message, ""])
    return "", history


def clear_chat(chat_history_state, chat_message):
    chat_history_state = []
    chat_message = ""
    return chat_history_state, chat_message
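# Build the Gradio Blocks UI: chat window, message box, prompt selector,
# sampling controls, and the event wiring between them.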
def gui(llm_chat):
    with gr.Blocks(theme="NoCrypt/miku", css=css) as app:
        gr.Markdown("# Llama 3 70B Instruct GGUF")
        gr.Markdown(
            f"""
            ### This demo uses the repository {REPO_ID} with the model {MODEL_NAME}, powered by the llama.cpp backend.
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    label="Chat",
                    height=700,
                    avatar_images=(
                        "assets/avatar_user.jpeg",
                        "assets/avatar_llama.jpeg",
                    ),
                )
            with gr.Column(scale=1):
                with gr.Row():
                    message = gr.Textbox(
                        label="Message",
                        placeholder="Ask me anything.",
                        lines=3,
                    )
                with gr.Row():
                    submit = gr.Button(value="Send message", variant="primary")
                    clear = gr.Button(value="New chat", variant="primary")
                    stop = gr.Button(value="Stop", variant="secondary")
                with gr.Accordion("Contextual Prompt Editor"):
                    default_task = "Assistant"
                    task_prompts_gui = gr.Dropdown(
                        list(TASK_PROMPT.keys()),
                        value=default_task,
                        label="Prompt selector",
                        visible=True,
                        interactive=True,
                    )
                    system_msg = gr.Textbox(
                        TASK_PROMPT[default_task],
                        label="System Message",
                        placeholder="system prompt",
                        lines=4,
                    )

                    def task_selector(choice):
                        return gr.update(value=TASK_PROMPT[choice])

                    task_prompts_gui.change(
                        task_selector,
                        [task_prompts_gui],
                        [system_msg],
                    )
                with gr.Accordion("Advanced settings", open=False):
                    with gr.Column():
                        max_tokens = gr.Slider(
                            20, 4096, label="Max Tokens", step=20, value=400
                        )
                        temperature = gr.Slider(
                            0.2, 2.0, label="Temperature", step=0.1, value=0.8
                        )
                        top_p = gr.Slider(
                            0.0, 1.0, label="Top P", step=0.05, value=0.95
                        )
                        top_k = gr.Slider(
                            0, 100, label="Top K", step=1, value=40
                        )
                        repeat_penalty = gr.Slider(
                            0.0,
                            2.0,
                            label="Repetition Penalty",
                            step=0.1,
                            value=1.1,
                        )

        chat_history_state = gr.State()

        # "New chat" clears both the stored history and the visible chatbot.
        clear.click(
            clear_chat,
            inputs=[chat_history_state, message],
            outputs=[chat_history_state, message],
            queue=False,
        )
        clear.click(lambda: None, None, chatbot, queue=False)

        # Send: first record the user turn, then stream the model's reply.
        submit_click_event = submit.click(
            fn=user,
            inputs=[message, chat_history_state],
            outputs=[message, chat_history_state],
            queue=True,
        ).then(
            fn=llm_chat.response,
            inputs=[
                chat_history_state,
                system_msg,
                max_tokens,
                temperature,
                top_p,
                top_k,
                repeat_penalty,
            ],
            outputs=[chatbot, chat_history_state],
            queue=True,
        )

        # "Stop" cancels the in-flight generation event.
        stop.click(
            fn=None,
            inputs=None,
            outputs=None,
            cancels=[submit_click_event],
            queue=False,
        )

    return app
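# Entry point: download the GGUF weights from the Hub, configure llama.cpp
# (n_gpu_layers=-1 offloads every layer to the GPU when CUDA is enabled),
# then build and launch the Gradio app.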
if __name__ == "__main__":
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)
    config_model = {
        "model_path": model_path,
        "n_ctx": MAX_CONTEXT_LENGTH,
        "n_gpu_layers": -1 if CUDA else 0,
    }
    llm_chat = ChatLLM(config_model)
    app = gui(llm_chat)
    app.queue(default_concurrency_limit=40)
    app.launch(
        max_threads=40,
        share=False,
        show_error=True,
        quiet=False,
        debug=True,
        allowed_paths=["./assets/"],
    )