ezcz committed on
Commit 333aeab
1 Parent(s): 790457e

Update space

Files changed (1)
  1. app.py +31 -30
app.py CHANGED
@@ -1,70 +1,71 @@
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from peft import PeftModel
-import os
+from transformers import pipeline
 import logging
+import warnings
+from threading import Lock
+
+# Suppress non-critical warnings
+warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Model configuration
-BASE_MODEL_ID = "unsloth/Llama-3.2-3B-instruct"  # The base model you fine-tuned
-ADAPTER_MODEL_ID = "ezcz/bright-llama-3b-chat"
+MODEL_ID = "ezcz/bright-llama-3b-chat"
+MAX_NEW_TOKENS = 256
+TEMPERATURE = 0.1
+TOP_P = 0.9
+TOP_K = 60
+REPETITION_PENALTY = 1.0
 
 # Check for GPU availability
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
 
-# Load the base model and apply the adapter
-logger.info("Loading base model...")
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
-base_model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL_ID,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
-)
-
-# Load the adapter model on top of the base model
-logger.info("Loading adapter weights...")
-model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-
-# Create the pipeline with the combined model
+# Load the pipeline
+logger.info("Loading model pipeline...")
 pipe = pipeline(
     "text-generation",
-    model=model,
-    tokenizer=tokenizer,
+    model=MODEL_ID,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
+    device_map="auto",
 )
 
+# Define the chat interface
 def chat_interface(user_input, history=None):
     if history is None:
         history = []
-
+
     messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
 
     try:
         outputs = pipe(
             messages,
-            max_new_tokens=256,
-            temperature=0.1,
-            top_p=0.9,
-            top_k=60,
-            repetition_penalty=1.0
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_p=TOP_P,
+            top_k=TOP_K,
+            repetition_penalty=REPETITION_PENALTY,
         )
         response = outputs[0]["generated_text"]
         history.append((user_input, response))
         return "", history
     except Exception as e:
-        logger.error(f"Error generating response: {e}")
+        logger.error(f"Error during response generation: {e}")
         return "Error generating response.", history
 
+# Define the Gradio interface
 with gr.Blocks() as demo:
     chatbot = gr.Chatbot()
     user_input = gr.Textbox(placeholder="Type your message...")
+    clear_button = gr.Button("Clear Chat")
     submit_button = gr.Button("Send")
+
     submit_button.click(chat_interface, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
+    user_input.submit(chat_interface, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
+    clear_button.click(lambda: ([], ""), inputs=[], outputs=[chatbot, user_input])
 
-demo.launch(debug=True, share=True)
+# Launch the UI
+demo.queue().launch(debug=True, share=True)
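
Note on the change: the previous app.py stacked the ezcz/bright-llama-3b-chat PEFT adapter onto the unsloth/Llama-3.2-3B-instruct base model at startup, while the new version passes the repo ID straight to transformers' pipeline(), which only works if the repository hosts standalone (merged) weights. A minimal sketch of how such a merge is typically produced with peft; the output directory name below is illustrative, not taken from this repo:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model and apply the adapter on top of it.
base = AutoModelForCausalLM.from_pretrained(
    "unsloth/Llama-3.2-3B-instruct", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-instruct")
model = PeftModel.from_pretrained(base, "ezcz/bright-llama-3b-chat")

# Fold the adapter weights into the base model so it loads without peft.
merged = model.merge_and_unload()

# Save (or push_to_hub) the standalone model; "bright-llama-3b-merged"
# is a hypothetical local directory name.
merged.save_pretrained("bright-llama-3b-merged")
tokenizer.save_pretrained("bright-llama-3b-merged")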