Spaces:

ezcz
/

bright-llama-chatbot

Running

App Files Community

ezcz committed 5 days ago

Commit

c3e099a

•

1 Parent(s): f91fde9

Update space

Browse files

Files changed (1) hide show

app.py +30 -31

app.py CHANGED Viewed

@@ -1,71 +1,70 @@
 import torch
 import gradio as gr
-from transformers import pipeline
 import logging
-import warnings
-from threading import Lock
-# Suppress non-critical warnings
-warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Model configuration
-MODEL_ID = "ezcz/bright-llama-3b-chat"
-MAX_NEW_TOKENS = 256
-TEMPERATURE = 0.1
-TOP_P = 0.9
-TOP_K = 60
-REPETITION_PENALTY = 1.0
 # Check for GPU availability
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
-# Load the pipeline
-logger.info("Loading model pipeline...")
 pipe = pipeline(
     "text-generation",
-    model=MODEL_ID,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
 )
-# Define the chat interface
 def chat_interface(user_input, history=None):
     if history is None:
         history = []
     messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
     try:
         outputs = pipe(
             messages,
-            max_new_tokens=MAX_NEW_TOKENS,
-            temperature=TEMPERATURE,
-            top_p=TOP_P,
-            top_k=TOP_K,
-            repetition_penalty=REPETITION_PENALTY,
         )
         response = outputs[0]["generated_text"]
         history.append((user_input, response))
         return "", history
     except Exception as e:
-        logger.error(f"Error during response generation: {e}")
         return "Error generating response.", history
-# Define the Gradio interface
 with gr.Blocks() as demo:
     chatbot = gr.Chatbot()
     user_input = gr.Textbox(placeholder="Type your message...")
-    clear_button = gr.Button("Clear Chat")
     submit_button = gr.Button("Send")
     submit_button.click(chat_interface, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
-    user_input.submit(chat_interface, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
-    clear_button.click(lambda: ([], ""), inputs=[], outputs=[chatbot, user_input])
-# Launch the UI
-demo.queue().launch(debug=True, share=True)

 import torch
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from peft import PeftModel
+import os
 import logging
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Model configuration
+BASE_MODEL_ID = "unsloth/Llama-3.2-3B-instruct"  # The base model you fine-tuned
+ADAPTER_MODEL_ID = "ezcz/bright-llama-3b-chat"
 # Check for GPU availability
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
+# Load the base model and apply the adapter
+logger.info("Loading base model...")
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+base_model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_ID,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"
+)
+# Load the adapter model on top of the base model
+logger.info("Loading adapter weights...")
+model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
+# Create the pipeline with the combined model
 pipe = pipeline(
     "text-generation",
+    model=model,
+    tokenizer=tokenizer,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"
 )
 def chat_interface(user_input, history=None):
     if history is None:
         history = []
     messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
     try:
         outputs = pipe(
             messages,
+            max_new_tokens=256,
+            temperature=0.1,
+            top_p=0.9,
+            top_k=60,
+            repetition_penalty=1.0
         )
         response = outputs[0]["generated_text"]
         history.append((user_input, response))
         return "", history
     except Exception as e:
+        logger.error(f"Error generating response: {e}")
         return "Error generating response.", history
 with gr.Blocks() as demo:
     chatbot = gr.Chatbot()
     user_input = gr.Textbox(placeholder="Type your message...")
     submit_button = gr.Button("Send")
     submit_button.click(chat_interface, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
+demo.launch(debug=True, share=True)