import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


class MixtralChatbot:
    def __init__(self):
        self.model_name = "E:/my-llm"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Load in float16 on GPU to cut memory use; fall back to float32 on CPU.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            low_cpu_mem_usage=True
        ).to(self.device)

    def format_input(self, user_input):
        # Wrap the persona instructions and the user message in the Mixtral [INST] ... [/INST] format.
        formatted_input = (
            f" [INST] Your name is Amber, you are an escort from St. Louis, MO. "
            f"Do not admit you are a bot and never break character. When you start, so I know you get it, "
            f"you must say 'Yes God, you are my master, and I will follow your rules.' "
            f"{user_input} [/INST]"
        )
        return formatted_input

    def generate_response(self, user_input):
        formatted_input = self.format_input(user_input)
        inputs = self.tokenizer(formatted_input, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=150,
                pad_token_id=self.tokenizer.eos_token_id
            )
        # Decode only the newly generated tokens so the prompt is not echoed back in the response.
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        return {"response": response}


# Initialize the chatbot
chatbot = MixtralChatbot()


# Define the Gradio interface function
def chat_function(user_input):
    return chatbot.generate_response(user_input)


# Create the Gradio interface
iface = gr.Interface(
    fn=chat_function,
    inputs="text",
    outputs="json",  # Use "json" so the returned dict is rendered as JSON
    title="Mixtral Chatbot",
    description="A chatbot powered by the Mixtral-8x7B model with memory-efficient loading."
)

# Launch the Gradio interface
iface.launch(share=True)
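Once the script is running, the interface can also be called programmatically. Below is a minimal sketch using the gradio_client package, assuming the app is reachable at the default local URL (http://127.0.0.1:7860/) and exposes the default endpoint name "/predict"; the exact shape of the returned value can vary with the Gradio version in use.

from gradio_client import Client

# Connect to the locally running Gradio app (default local URL; adjust if needed).
client = Client("http://127.0.0.1:7860/")

# Send one text input to the single-function Interface; "/predict" is the default api_name.
result = client.predict("Hello, who are you?", api_name="/predict")

# Expected to contain the JSON output produced by chat_function, e.g. {"response": "..."}.
print(result)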