import os

import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate


class RAGInterface:
    def __init__(self):
        # Initialize embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Load vector store
        persist_directory = os.path.join(os.path.dirname(__file__), 'mydb')
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )

        # Model configurations
        self.model_configs = {
            "Llama 3.2 3B (Fast, Less Accurate)": {
                "repo_id": "bartowski/Llama-3.2-3B-Instruct-GGUF",
                "filename": "Llama-3.2-3B-Instruct-Q6_K.gguf",
            },
            "Llama 3.1 8B (Slower, More Accurate)": {
                "repo_id": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
                "filename": "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
            }
        }

        # Initialize with default model
        self.current_model = "Llama 3.1 8B (Slower, More Accurate)"
        self.load_model(self.current_model)

        # Define RAG prompt template
        self.template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question in a clear way. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."

Make sure to:
1. Only use information from the provided context
2. If you're unsure, acknowledge it
"""
        self.prompt = PromptTemplate.from_template(self.template)

    def load_model(self, model_name):
        config = self.model_configs[model_name]
        self.llm = Llama.from_pretrained(
            repo_id=config["repo_id"],
            filename=config["filename"],
            n_ctx=2048
        )
        self.current_model = model_name

    def respond(self, message, history, system_message, model_choice, temperature, max_tokens=2048):
        # Load new model if different from current
        if model_choice != self.current_model:
            self.load_model(model_choice)

        # Build messages list
        messages = [{"role": "system", "content": system_message}]
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

        # Search vector store
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 5})
        docs = retriever.get_relevant_documents(message)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Format prompt and add to messages
        final_prompt = self.prompt.format(context=context, question=message)
        messages.append({"role": "user", "content": final_prompt})

        # Generate response
        response = self.llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response['choices'][0]['message']['content']

    def create_interface(self):
        # Custom CSS for better styling
        custom_css = """
        """

        # Header HTML
        header_html = f"""