Spaces:

oceansweep
/

tldw

Running

App Files Files Community

oceansweep committed on Sep 11

Commit

a6ecdfa

•

1 Parent(s): 32b7e17

Upload 3 files

Browse files

Files changed (3) hide show

App_Function_Libraries/RAG/Embeddings_Create.py +167 -0
App_Function_Libraries/RAG/RAG_Libary_2.py +332 -0
App_Function_Libraries/RAG/RAG_QA_Chat.py +84 -0

App_Function_Libraries/RAG/Embeddings_Create.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# Embeddings_Create.py
+# Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
+#
+# Imports:
+import logging
+from typing import List, Dict, Any
+import numpy as np
+#
+# 3rd-Party Imports:
+import requests
+from transformers import AutoTokenizer, AutoModel
+import torch
+#
+# Local Imports:
+from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
+from App_Function_Libraries.Summarization_General_Lib import summarize
+from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+from App_Function_Libraries.Chunk_Lib import chunk_options, improved_chunking_process, determine_chunk_position
+#
+#
+#######################################################################################################################
+#
+# Functions:
+# FIXME - Add all globals to summarize.py
+# Module-level embedding settings. These are read once, at import time, from the
+# 'Embeddings' section of the object returned by load_comprehensive_config().
+loaded_config = load_comprehensive_config()
+# Provider name; create_stella_embeddings() compares it against 'local' and 'openai'.
+embedding_provider = loaded_config['Embeddings']['embedding_provider']
+# Model name handed to get_openai_embeddings() by create_stella_embeddings().
+embedding_model = loaded_config['Embeddings']['embedding_model']
+embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
+embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
+# Embedding Chunking Settings
+# NOTE(review): the values are stored exactly as the config lookup returns them;
+# if the config object is a ConfigParser they are strings, not ints - TODO confirm.
+chunk_size = loaded_config['Embeddings']['chunk_size']
+overlap = loaded_config['Embeddings']['overlap']
+# FIXME - Add logging
+# FIXME - refactor/setup to use config file & perform chunking
+def create_embedding(text: str, provider: str, model: str, api_url: str = None, api_key: str = None) -> List[float]:
+    try:
+        if provider == 'openai':
+            embedding = get_openai_embeddings(text, model)
+        elif provider == 'local':
+            embedding = create_local_embedding(text, model, api_url, api_key)
+        elif provider == 'huggingface':
+            embedding = create_huggingface_embedding(text, model)
+        elif provider == 'llamacpp':
+            embedding = create_llamacpp_embedding(text, api_url)
+        else:
+            raise ValueError(f"Unsupported embedding provider: {provider}")
+        if isinstance(embedding, np.ndarray):
+            embedding = embedding.tolist()
+        elif isinstance(embedding, torch.Tensor):
+            embedding = embedding.detach().cpu().numpy().tolist()
+        return embedding
+    except Exception as e:
+        logging.error(f"Error creating embedding: {str(e)}")
+        raise
+def create_huggingface_embedding(text: str, model: str) -> List[float]:
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    model = AutoModel.from_pretrained(model)
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+    return embeddings[0].tolist()
+# FIXME
+def create_stella_embeddings(text: str) -> List[float]:
+    if embedding_provider == 'local':
+        # Load the model and tokenizer
+        tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
+        model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
+        # Tokenize and encode the text
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        # Generate embeddings
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Use the mean of the last hidden state as the sentence embedding
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        return embeddings[0].tolist()  # Convert to list for consistency
+    elif embedding_provider == 'openai':
+        return get_openai_embeddings(text, embedding_model)
+    else:
+        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
+    response = requests.post(
+        api_url,
+        json={"input": text}
+    )
+    response.raise_for_status()
+    return response.json()['embedding']
+def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
+    response = requests.post(
+        api_url,
+        json={"text": text, "model": model},
+        headers={"Authorization": f"Bearer {api_key}"}
+    )
+    response.raise_for_status()
+    return response.json().get('embedding', None)
+def chunk_for_embedding(text: str, file_name: str, api_name, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+    options = chunk_options.copy()
+    if custom_chunk_options:
+        options.update(custom_chunk_options)
+    # FIXME
+    if api_name is not None:
+        # Generate summary of the full document
+        full_summary = summarize(text, None, api_name, None, None, None)
+    else:
+        full_summary = "Full document summary not available."
+    chunks = improved_chunking_process(text, options)
+    total_chunks = len(chunks)
+    chunked_text_with_headers = []
+    for i, chunk in enumerate(chunks, 1):
+        chunk_text = chunk['text']
+        chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
+        chunk_header = f"""
+        Original Document: {file_name}
+        Full Document Summary: {full_summary}
+        Chunk: {i} of {total_chunks}
+        Position: {chunk_position}
+        --- Chunk Content ---
+        """
+        full_chunk_text = chunk_header + chunk_text
+        chunk['text'] = full_chunk_text
+        chunk['metadata']['file_name'] = file_name
+        chunked_text_with_headers.append(chunk)
+    return chunked_text_with_headers
+def create_openai_embedding(text: str, model: str) -> List[float]:
+    embedding = get_openai_embeddings(text, model)
+    return embedding
+#
+# End of File.
+#######################################################################################################################

App_Function_Libraries/RAG/RAG_Libary_2.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# RAG_Library_2.py
+# Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
+#
+# Import necessary modules and functions
+import configparser
+import logging
+import os
+from typing import Dict, Any, List, Optional
+# Local Imports
+from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
+from App_Function_Libraries.Article_Extractor_Lib import scrape_article
+from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
+    fetch_keywords_for_media
+from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+#
+# 3rd-Party Imports
+import openai
+#
+########################################################################################################################
+#
+# Functions:
+# Initialize OpenAI client (adjust this based on your API key management)
+openai.api_key = "your-openai-api-key"
+# Get the directory of the current script
+current_dir = os.path.dirname(os.path.abspath(__file__))
+# Construct the path to the config file
+config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
+# Read the config file
+config = configparser.ConfigParser()
+# Read the configuration file
+config.read('config.txt')
+# Main RAG pipeline function
+def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+    """Scrape one article, index it, and answer ``query`` from the retrieved context.
+
+    Stages: scrape -> store in the media database -> process/store in a
+    per-article collection -> vector search + full-text search -> generate answer.
+    Every stage has its own try/except so the returned error names the stage
+    that failed.
+
+    Args:
+        url: Address of the article to scrape.
+        query: The question to answer.
+        api_choice: Name of the LLM API passed to generate_answer(). With the
+            default None, generate_answer() raises ValueError and this function
+            returns the "Failed to generate answer" error dict.
+
+    Returns:
+        {"answer": ..., "context": ...} on success, otherwise
+        {"error": <stage message>, "details": <exception text>}.
+    """
+    try:
+        # Extract content
+        try:
+            article_data = scrape_article(url)
+            content = article_data['content']
+            title = article_data['title']
+        except Exception as e:
+            logging.error(f"Error scraping article: {str(e)}")
+            return {"error": "Failed to scrape article", "details": str(e)}
+        # Store the article in the database and get the media_id
+        try:
+            media_id = add_media_to_database(url, title, 'article', content)
+        except Exception as e:
+            logging.error(f"Error adding article to database: {str(e)}")
+            return {"error": "Failed to store article in database", "details": str(e)}
+        # Process and store content
+        # One collection per article, named after the database media_id.
+        collection_name = f"article_{media_id}"
+        try:
+            process_and_store_content(content, collection_name, media_id, title)
+        except Exception as e:
+            logging.error(f"Error processing and storing content: {str(e)}")
+            return {"error": "Failed to process and store content", "details": str(e)}
+        # Perform searches
+        try:
+            vector_results = vector_search(collection_name, query, k=5)
+            # NOTE(review): no media_id is passed to the full-text search, so it is
+            # presumably not restricted to this article - confirm against search_db.
+            fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+        except Exception as e:
+            logging.error(f"Error performing searches: {str(e)}")
+            return {"error": "Failed to perform searches", "details": str(e)}
+        # Combine results with error handling for missing 'content' key
+        all_results = []
+        for result in vector_results + fts_results:
+            if isinstance(result, dict) and 'content' in result:
+                all_results.append(result['content'])
+            else:
+                # Keep the result anyway, as its string form, rather than dropping it.
+                logging.warning(f"Unexpected result format: {result}")
+                all_results.append(str(result))
+        context = "\n".join(all_results)
+        # Generate answer using the selected API
+        try:
+            answer = generate_answer(api_choice, context, query)
+        except Exception as e:
+            logging.error(f"Error generating answer: {str(e)}")
+            return {"error": "Failed to generate answer", "details": str(e)}
+        return {
+            "answer": answer,
+            "context": context
+        }
+    except Exception as e:
+        # Catches anything raised outside the per-stage handlers above.
+        logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
+        return {"error": "An unexpected error occurred", "details": str(e)}
+# RAG Search with keyword filtering
+def enhanced_rag_pipeline(query: str, api_choice: str, keywords: str = None) -> Dict[str, Any]:
+    try:
+        # Load embedding provider from config, or fallback to 'openai'
+        embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
+        # Log the provider used
+        logging.debug(f"Using embedding provider: {embedding_provider}")
+        # Process keywords if provided
+        keyword_list = [k.strip().lower() for k in keywords.split(',')] if keywords else []
+        logging.debug(f"enhanced_rag_pipeline - Keywords: {keyword_list}")
+        # Fetch relevant media IDs based on keywords if keywords are provided
+        relevant_media_ids = fetch_relevant_media_ids(keyword_list) if keyword_list else None
+        logging.debug(f"enhanced_rag_pipeline - relevant media IDs: {relevant_media_ids}")
+        # Perform vector search
+        vector_results = perform_vector_search(query, relevant_media_ids)
+        logging.debug(f"enhanced_rag_pipeline - Vector search results: {vector_results}")
+        # Perform full-text search
+        fts_results = perform_full_text_search(query, relevant_media_ids)
+        logging.debug(f"enhanced_rag_pipeline - Full-text search results: {fts_results}")
+        # Combine results
+        all_results = vector_results + fts_results
+        # FIXME
+        if not all_results:
+            logging.info(f"No results found. Query: {query}, Keywords: {keywords}")
+            return {
+                "answer": "I couldn't find any relevant information based on your query and keywords.",
+                "context": ""
+            }
+        # FIXME - Apply Re-Ranking of results here
+        apply_re_ranking = False
+        if apply_re_ranking:
+            # Implement re-ranking logic here
+            pass
+        # Extract content from results
+        context = "\n".join([result['content'] for result in all_results[:10]])  # Limit to top 10 results
+        logging.debug(f"Context length: {len(context)}")
+        logging.debug(f"Context: {context[:200]}")
+        # Generate answer using the selected API
+        answer = generate_answer(api_choice, context, query)
+        return {
+            "answer": answer,
+            "context": context
+        }
+    except Exception as e:
+        logging.error(f"Error in enhanced_rag_pipeline: {str(e)}")
+        return {
+            "answer": "An error occurred while processing your request.",
+            "context": ""
+        }
+def generate_answer(api_choice: str, context: str, query: str) -> str:
+    """Ask the selected LLM API to answer ``query`` given ``context``.
+
+    Builds the prompt "Context: <context>\\n\\nQuestion: <query>" and hands it to
+    the summarize_with_* function that matches ``api_choice``. The match is an
+    exact, case-sensitive string comparison (note that "ollama" is lower-case).
+
+    Args:
+        api_choice: One of the API names tested below.
+        context: Retrieved text the answer should be based on.
+        query: The user's question.
+
+    Returns:
+        Whatever the selected summarize_with_* function returns.
+
+    Raises:
+        ValueError: if ``api_choice`` matches none of the supported names.
+    """
+    logging.debug("Entering generate_answer function")
+    # The config is re-loaded on every call; keys are read from its 'API' section.
+    config = load_comprehensive_config()
+    logging.debug(f"Config sections: {config.sections()}")
+    prompt = f"Context: {context}\n\nQuestion: {query}"
+    # Each branch imports its backend lazily, so only the selected one is loaded.
+    # NOTE(review): the trailing "" argument is presumably an empty custom prompt -
+    # confirm against the summarize_with_* signatures.
+    if api_choice == "OpenAI":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai
+        return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
+    elif api_choice == "Anthropic":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic
+        return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
+    elif api_choice == "Cohere":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere
+        return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
+    elif api_choice == "Groq":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq
+        return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
+    elif api_choice == "OpenRouter":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter
+        return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
+    elif api_choice == "HuggingFace":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface
+        return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
+    elif api_choice == "DeepSeek":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek
+        return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
+    elif api_choice == "Mistral":
+        from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral
+        return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
+    elif api_choice == "Local-LLM":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm
+        # Unlike the other branches this passes a path setting, not an API key.
+        return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
+    elif api_choice == "Llama.cpp":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama
+        return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
+    elif api_choice == "Kobold":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold
+        return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
+    elif api_choice == "Ooba":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga
+        return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
+    elif api_choice == "TabbyAPI":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi
+        return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
+    elif api_choice == "vLLM":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm
+        return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
+    elif api_choice == "ollama":
+        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama
+        return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
+    else:
+        raise ValueError(f"Unsupported API choice: {api_choice}")
+# Function to preprocess and store all existing content in the database
+def preprocess_all_content():
+    unprocessed_media = get_unprocessed_media()
+    for row in unprocessed_media:
+        media_id = row[0]
+        content = row[1]
+        media_type = row[2]
+        collection_name = f"{media_type}_{media_id}"
+        process_and_store_content(content, collection_name, media_id, "")
+def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+    all_collections = chroma_client.list_collections()
+    vector_results = []
+    for collection in all_collections:
+        collection_results = vector_search(collection.name, query, k=5)
+        filtered_results = [
+            result for result in collection_results
+            if relevant_media_ids is None or result['metadata'].get('media_id') in relevant_media_ids
+        ]
+        vector_results.extend(filtered_results)
+    return vector_results
+def perform_full_text_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+    fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+    filtered_fts_results = [
+        {
+            "content": result['content'],
+            "metadata": {"media_id": result['id']}
+        }
+        for result in fts_results
+        if relevant_media_ids is None or result['id'] in relevant_media_ids
+    ]
+    return filtered_fts_results
+def fetch_relevant_media_ids(keywords: List[str]) -> List[int]:
+    relevant_ids = set()
+    try:
+        for keyword in keywords:
+            media_ids = fetch_keywords_for_media(keyword)
+            relevant_ids.update(media_ids)
+    except Exception as e:
+        logging.error(f"Error fetching relevant media IDs: {str(e)}")
+    return list(relevant_ids)
+def filter_results_by_keywords(results: List[Dict[str, Any]], keywords: List[str]) -> List[Dict[str, Any]]:
+    if not keywords:
+        return results
+    filtered_results = []
+    for result in results:
+        try:
+            metadata = result.get('metadata', {})
+            if metadata is None:
+                logging.warning(f"No metadata found for result: {result}")
+                continue
+            if not isinstance(metadata, dict):
+                logging.warning(f"Unexpected metadata type: {type(metadata)}. Expected dict.")
+                continue
+            media_id = metadata.get('media_id')
+            if media_id is None:
+                logging.warning(f"No media_id found in metadata: {metadata}")
+                continue
+            media_keywords = fetch_keywords_for_media(media_id)
+            if any(keyword.lower() in [mk.lower() for mk in media_keywords] for keyword in keywords):
+                filtered_results.append(result)
+        except Exception as e:
+            logging.error(f"Error processing result: {result}. Error: {str(e)}")
+    return filtered_results
+# FIXME: to be implemented
+def extract_media_id_from_result(result: str) -> Optional[int]:
+    # Implement this function based on how you store the media_id in your results
+    # For example, if it's stored at the beginning of each result:
+    try:
+        return int(result.split('_')[0])
+    except (IndexError, ValueError):
+        logging.error(f"Failed to extract media_id from result: {result}")
+        return None
+# Example usage:
+# 1. Initialize the system:
+# create_tables(db)  # Ensure FTS tables are set up
+#
+# 2. Create ChromaDB
+# chroma_client = ChromaDBClient()
+#
+# 3. Create Embeddings
+# Store embeddings in ChromaDB
+# preprocess_all_content() or create_embeddings()
+#
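+# 4. Perform RAG search across all content (api_choice must be a name generate_answer() accepts):
+# result = enhanced_rag_pipeline("What are the key points about climate change?", api_choice="OpenAI")
+# print(result['answer'])
+#
+# (Extra)5. Perform RAG on a specific URL (on failure the result holds 'error'/'details' instead of 'answer'):
+# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?", api_choice="OpenAI")
+# print(result['answer'])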
+#
+########################################################################################################################
+############################################################################################################
+#
+# ElasticSearch Retriever
+# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
+#
+# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
+#
+# End of RAG_Library_2.py
+############################################################################################################

App_Function_Libraries/RAG/RAG_QA_Chat.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# RAG_QA_Chat.py
+# Description: Helper functions for the RAG QA chat (answer generation, chat history save/load, database lookups)
+#
+# Imports
+#
+#
+# External Imports
+import json
+import logging
+import tempfile
+from typing import List, Tuple, IO, Union
+#
+# Local Imports
+from App_Function_Libraries.DB.DB_Manager import db, search_db, DatabaseError, get_media_content
+from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer
+#
+########################################################################################################################
+#
+# Functions:
+def rag_qa_chat(message: str, history: List[Tuple[str, str]], context: Union[str, IO[str]], api_choice: str) -> Tuple[List[Tuple[str, str]], str]:
+    try:
+        # Prepare the context based on the selected source
+        if hasattr(context, 'read'):
+            # Handle uploaded file
+            context_text = context.read()
+            if isinstance(context_text, bytes):
+                context_text = context_text.decode('utf-8')
+        elif isinstance(context, str) and context.startswith("media_id:"):
+            # Handle existing file or search result
+            media_id = int(context.split(":")[1])
+            context_text = get_media_content(media_id)  # Implement this function to fetch content from the database
+        else:
+            context_text = str(context)
+        # Prepare the full context including chat history
+        full_context = "\n".join([f"Human: {h[0]}\nAI: {h[1]}" for h in history])
+        full_context += f"\n\nContext: {context_text}\n\nHuman: {message}\nAI:"
+        # Generate response using the selected API
+        response = generate_answer(api_choice, full_context, message)
+        # Update history
+        history.append((message, response))
+        return history, ""
+    except DatabaseError as e:
+        logging.error(f"Database error in rag_qa_chat: {str(e)}")
+        return history, f"An error occurred while accessing the database: {str(e)}"
+    except Exception as e:
+        logging.error(f"Unexpected error in rag_qa_chat: {str(e)}")
+        return history, f"An unexpected error occurred: {str(e)}"
+def save_chat_history(history: List[Tuple[str, str]]) -> str:
+    # Save chat history to a file
+    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
+        json.dump(history, temp_file)
+        return temp_file.name
+def load_chat_history(file: IO[str]) -> List[Tuple[str, str]]:
+    # Load chat history from a file
+    return json.load(file)
+def search_database(query: str) -> List[Tuple[int, str]]:
+    # Implement database search functionality
+    results = search_db(query, ["title", "content"], "", page=1, results_per_page=10)
+    return [(result['id'], result['title']) for result in results]
+def get_existing_files() -> List[Tuple[int, str]]:
+    # Fetch list of existing files from the database
+    with db.get_connection() as conn:
+        cursor = conn.cursor()
+        cursor.execute("SELECT id, title FROM Media ORDER BY title")
+        return cursor.fetchall()
+#
+# End of RAG_QA_Chat.py
+########################################################################################################################