import os

import gradio as gr
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


# Disable tokenizer thread parallelism (avoids fork-related warnings), let the
# CUDA caching allocator grow segments on demand to reduce fragmentation, and
# force synchronous CUDA kernel launches so errors surface at the call site.
# Note that CUDA_LAUNCH_BLOCKING trades throughput for easier debugging.
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# Module-level cache so the model and tokenizer are loaded only once.
model = None
tokenizer = None


def load_model_and_tokenizer(model_name, dtype, kv_bits):
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.add_special_tokens({"pad_token": "<PAD>"})

        config = AutoConfig.from_pretrained(model_name)
        if kv_bits != "unquantized":
            # Point the config at the pre-computed quantization codebook,
            # e.g. codebooks/Hermes-2-Theta-Llama-3-8B_1bit.xmad.
            quantizer_path = os.path.join("codebooks", f"{model_name.split('/')[-1]}_{kv_bits}bit.xmad")
            config.quantizer_path = quantizer_path

        # Resolve the short dtype name explicitly: torch has no "fp16" attribute,
        # so a torch.__dict__ lookup would silently fall back to float32.
        torch_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype, torch.float32)
        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=torch_dtype, device_map="auto")

        # Grow the embedding matrix if adding <PAD> enlarged the vocabulary.
        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
            model.resize_token_embeddings(len(tokenizer))

        tokenizer.padding_side = "left"
        model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


# Load eagerly at startup (1-bit codebook, fp16 weights) so the first chat
# request doesn't pay the cold-start cost.
load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "1")
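
# The cache above makes repeat calls no-ops, so switching configurations at
# runtime means clearing the globals first. A minimal sketch, assuming a
# matching codebook file exists for the requested bit-width:
#
#     model, tokenizer = None, None
#     load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "unquantized")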


def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Rebuild the full conversation in chat-template format.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    tokenized_input_prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    response = ""
    try:
        with torch.no_grad():
            # Generate one token at a time so partial output can be streamed to
            # the UI. Counting generated tokens (rather than words in the text)
            # keeps the loop consistent with the "Max new tokens" slider.
            for _ in range(int(max_tokens)):
                output = model.generate(
                    tokenized_input_prompt_ids,
                    max_new_tokens=1,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    return_dict_in_generate=True,
                )
                next_token_id = output.sequences[:, -1:]
                tokenized_input_prompt_ids = torch.cat([tokenized_input_prompt_ids, next_token_id], dim=1)
                response += tokenizer.decode(next_token_id[0], skip_special_tokens=True)
                yield response

                # Batch size is 1, so a single comparison detects end-of-sequence.
                if next_token_id.item() == tokenizer.eos_token_id:
                    break
    except Exception as e:
        yield f"Error: {e}"


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    theme="default",
    title="1bit Llama3 by xMAD.ai",
    description="""
Welcome to the future of AI with xMAD.ai's 1bit Llama3, a breakthrough in Large Language Model (LLM) quantization and efficiency. Our cutting-edge technology offers:

1. **Unmatched Speed**: Achieve an impressive 800 tokens per second on NVIDIA V100 and 1200 tokens per second on NVIDIA A100.
2. **Cost Efficiency**: Slash your cloud hosting expenses by up to 90% with our highly optimized models, delivering significant savings for enterprises.
3. **Scalability**: Support up to 10x the number of concurrent users without compromising performance, ensuring seamless user experiences.
4. **Memory Savings**: Experience a 7x memory reduction, allowing you to run powerful LLMs on standard hardware.
5. **Democratization of AI**: Make advanced LLMs accessible for various applications, from customer service to content creation, all while maintaining high accuracy and reliability.

Our Llama3 model is the first in the industry to achieve 1-bit quantization without loss in model performance. This innovation enables businesses to deploy robust AI solutions locally or in the cloud with minimal overhead.

Explore the potential of Llama3 with our interactive demo, where you can see real-time text generation and understand how our technology can transform your operations. Whether you are looking to enhance your chatbot capabilities, streamline your operations, or cut down on AI deployment costs, xMAD.ai offers a solution that scales with your needs.

Join us in redefining AI efficiency and cost-effectiveness. Try the demo now and see the difference!
""",
    css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }",
)
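
# Note: depending on the Gradio version, generator-based (streaming) handlers
# may require enabling the request queue before launching, e.g.:
#
#     demo.queue()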


if __name__ == "__main__":
    demo.launch(share=False)