Spaces:

oceansweep
/

tldw

Running

App Files Files Community

oceansweep committed on Sep 12

Commit

aa1db93

•

1 Parent(s): cfb144b

Update App_Function_Libraries/RAG/ChromaDB_Library.py

Browse files

Files changed (1) hide show

App_Function_Libraries/RAG/ChromaDB_Library.py +289 -286

App_Function_Libraries/RAG/ChromaDB_Library.py CHANGED Viewed

@@ -1,287 +1,290 @@
-import configparser
-import logging
-import sqlite3
-from typing import List, Dict, Any
-import chromadb
-import requests
-from chromadb import Settings
-from App_Function_Libraries.Chunk_Lib import improved_chunking_process
-from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
-from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
-#######################################################################################################################
-#
-# Functions for ChromaDB
-# Get ChromaDB settings
-# Load configuration
-config = configparser.ConfigParser()
-config.read('config.txt')
-chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
-chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
-# Get embedding settings
-embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
-embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
-embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
-embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
-# Get chunking options
-chunk_options = {
-    'method': config.get('Chunking', 'method', fallback='words'),
-    'max_size': config.getint('Chunking', 'max_size', fallback=400),
-    'overlap': config.getint('Chunking', 'overlap', fallback=200),
-    'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
-    'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
-    'language': config.get('Chunking', 'language', fallback='english')
-}
-def auto_update_chroma_embeddings(media_id: int, content: str):
-    """
-    Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.
-    :param media_id: The ID of the newly ingested media item
-    :param content: The content of the newly ingested media item
-    """
-    collection_name = f"media_{media_id}"
-    # Initialize or get the ChromaDB collection
-    collection = chroma_client.get_or_create_collection(name=collection_name)
-    # Check if embeddings already exist for this media_id
-    existing_embeddings = collection.get(ids=[f"{media_id}_chunk_{i}" for i in range(len(content))])
-    if existing_embeddings and len(existing_embeddings) > 0:
-        logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
-    else:
-        # Process and store content if embeddings do not already exist
-        process_and_store_content(content, collection_name, media_id)
-        logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
-# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
-def process_and_store_content(content: str, collection_name: str, media_id: int):
-    # Process the content into chunks
-    chunks = improved_chunking_process(content, chunk_options)
-    texts = [chunk['text'] for chunk in chunks]
-    # Generate embeddings for each chunk
-    embeddings = [create_embedding(text) for text in texts]
-    # Create unique IDs for each chunk using the media_id and chunk index
-    ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]
-    # Store the texts, embeddings, and IDs in ChromaDB
-    store_in_chroma(collection_name, texts, embeddings, ids)
-    # Store the chunk metadata in SQLite
-    for i, chunk in enumerate(chunks):
-        add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])
-    # Update the FTS table
-    update_fts_for_media(media_id)
-# Function to store documents and their embeddings in ChromaDB
-def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
-    collection = chroma_client.get_or_create_collection(name=collection_name)
-    collection.add(
-        documents=texts,
-        embeddings=embeddings,
-        ids=ids
-    )
-# Function to perform vector search using ChromaDB
-def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
-    query_embedding = create_embedding(query)
-    collection = chroma_client.get_collection(name=collection_name)
-    results = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=k
-    )
-    return results['documents'][0]
-def create_embedding(text: str) -> List[float]:
-    global embedding_provider, embedding_model, embedding_api_url, embedding_api_key
-    if embedding_provider == 'openai':
-        return get_openai_embeddings(text, embedding_model)
-    elif embedding_provider == 'local':
-        response = requests.post(
-            embedding_api_url,
-            json={"text": text, "model": embedding_model},
-            headers={"Authorization": f"Bearer {embedding_api_key}"}
-        )
-        return response.json()['embedding']
-    elif embedding_provider == 'huggingface':
-        from transformers import AutoTokenizer, AutoModel
-        import torch
-        tokenizer = AutoTokenizer.from_pretrained(embedding_model)
-        model = AutoModel.from_pretrained(embedding_model)
-        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        with torch.no_grad():
-            outputs = model(**inputs)
-        # Use the mean of the last hidden state as the sentence embedding
-        embeddings = outputs.last_hidden_state.mean(dim=1)
-        return embeddings[0].tolist()  # Convert to list for consistency
-    else:
-        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
-def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
-    try:
-        all_content = get_all_content_from_database()
-        if not all_content:
-            return "No content found in the database."
-        texts_to_embed = []
-        embeddings_to_store = []
-        ids_to_store = []
-        collection_name = "all_content_embeddings"
-        # Initialize or get the ChromaDB collection
-        collection = chroma_client.get_or_create_collection(name=collection_name)
-        for content_item in all_content:
-            media_id = content_item['id']
-            text = content_item['content']
-            # Check if the embedding already exists in ChromaDB
-            embedding_exists = collection.get(ids=[f"doc_{media_id}"])
-            if embedding_exists:
-                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
-                continue  # Skip if embedding already exists
-            # Create the embedding
-            if api_choice == "openai":
-                embedding = create_openai_embedding(text, model_or_url)
-            else:  # Llama.cpp
-                embedding = create_llamacpp_embedding(text, model_or_url)
-            # Collect the text, embedding, and ID for batch storage
-            texts_to_embed.append(text)
-            embeddings_to_store.append(embedding)
-            ids_to_store.append(f"doc_{media_id}")
-        # Store all new embeddings in ChromaDB
-        if texts_to_embed and embeddings_to_store:
-            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)
-        return "Embeddings created and stored successfully for all new content."
-    except Exception as e:
-        logging.error(f"Error during embedding creation: {str(e)}")
-        return f"Error: {str(e)}"
-def create_openai_embedding(text: str, model: str) -> List[float]:
-    openai_api_key = config['API']['openai_api_key']
-    embedding = get_openai_embeddings(text, model)
-    return embedding
-def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
-    response = requests.post(
-        api_url,
-        json={"input": text}
-    )
-    if response.status_code == 200:
-        return response.json()['embedding']
-    else:
-        raise Exception(f"Error from Llama.cpp API: {response.text}")
-def get_all_content_from_database() -> List[Dict[str, Any]]:
-    """
-    Retrieve all media content from the database that requires embedding.
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields.
-    """
-    try:
-        from App_Function_Libraries.DB.DB_Manager import db
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            cursor.execute("""
-                SELECT id, content, title, author, type
-                FROM Media
-                WHERE is_trash = 0  -- Exclude items marked as trash
-            """)
-            media_items = cursor.fetchall()
-            # Convert the results into a list of dictionaries
-            all_content = [
-                {
-                    'id': item[0],
-                    'content': item[1],
-                    'title': item[2],
-                    'author': item[3],
-                    'type': item[4]
-                }
-                for item in media_items
-            ]
-        return all_content
-    except sqlite3.Error as e:
-        logging.error(f"Error retrieving all content from database: {e}")
-        from App_Function_Libraries.DB.SQLite_DB import DatabaseError
-        raise DatabaseError(f"Error retrieving all content from database: {e}")
-def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
-    collection = chroma_client.get_or_create_collection(name=collection_name)
-    collection.add(
-        documents=texts,
-        embeddings=embeddings,
-        ids=ids,
-        metadatas=[{'source': source} for source in sources]
-    )
-def check_embedding_status(selected_item):
-    if not selected_item:
-        return "Please select an item", ""
-    item_id = selected_item.split('(')[0].strip()
-    collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
-    result = collection.get(ids=[f"doc_{item_id}"])
-    if result['ids']:
-        embedding = result['embeddings'][0]
-        embedding_preview = str(embedding[:50])  # Convert first 50 elements to string
-        return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
-    else:
-        return f"No embedding found for item: {item_id}", ""
-def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
-    if not selected_item:
-        return "Please select an item"
-    item_id = selected_item.split('(')[0].strip()
-    items = get_all_content_from_database()
-    item = next((item for item in items if item['title'] == item_id), None)
-    if not item:
-        return f"Item not found: {item_id}"
-    try:
-        if api_choice == "OpenAI":
-            embedding = create_embedding(item['content'])
-        else:  # Llama.cpp
-            embedding = create_embedding(item['content'])
-        collection_name = "all_content_embeddings"
-        store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
-        return f"New embedding created and stored for item: {item_id}"
-    except Exception as e:
-        return f"Error creating embedding: {str(e)}"
-#
-# End of Functions for ChromaDB
 #######################################################################################################################

+import configparser
+import logging
+import sqlite3
+from typing import List, Dict, Any
+import chromadb
+import requests
+from chromadb import Settings
+from App_Function_Libraries.Chunk_Lib import improved_chunking_process
+from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
+from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
+#######################################################################################################################
+#
+# Functions for ChromaDB
+# Get ChromaDB settings
+# Load configuration
+# Parse config.txt. ConfigParser.read() ignores a file it cannot find, and every
+# lookup below supplies a fallback, so a missing file or option yields the defaults.
+config = configparser.ConfigParser()
+config.read('config.txt')
+# Create the persistent ChromaDB client at import time, with anonymized telemetry disabled.
+chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
+chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
+# Request the pure-Python protobuf implementation.
+# NOTE(review): this is set after `import chromadb` and after the client has been
+# created; protobuf presumably reads the variable when it is first imported, so the
+# assignment may need to move above those lines to take effect - TODO confirm.
+import os
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+# Get embedding settings
+# `embedding_provider` selects the branch taken in create_embedding()
+# ('openai', 'local' or 'huggingface'); the other three values are read by those branches.
+embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
+embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
+embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
+embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
+# Get chunking options
+# This dict is passed unchanged to improved_chunking_process() by process_and_store_content().
+chunk_options = {
+    'method': config.get('Chunking', 'method', fallback='words'),
+    'max_size': config.getint('Chunking', 'max_size', fallback=400),
+    'overlap': config.getint('Chunking', 'overlap', fallback=200),
+    'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
+    'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
+    'language': config.get('Chunking', 'language', fallback='english')
+}
+def auto_update_chroma_embeddings(media_id: int, content: str):
+    """
+    Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.
+    The content is chunked and embedded only when the item's collection does not already hold its first chunk.
+    :param media_id: The ID of the newly ingested media item
+    :param content: The content of the newly ingested media item
+    """
+    collection_name = f"media_{media_id}"
+    # Initialize or get the ChromaDB collection
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+    # process_and_store_content() numbers chunk IDs from 0, so looking up chunk 0 is enough
+    # to tell whether this media item was embedded before. The lookup result is a dict that
+    # is truthy even when nothing matched, so the decision must be made on its 'ids' list.
+    existing_embeddings = collection.get(ids=[f"{media_id}_chunk_0"])
+    if existing_embeddings['ids']:
+        logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
+        return
+    # Process and store content if embeddings do not already exist
+    process_and_store_content(content, collection_name, media_id)
+    logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
+# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
+def process_and_store_content(content: str, collection_name: str, media_id: int):
+    """
+    Chunk the content, embed each chunk, and record the chunks in both ChromaDB and SQLite.
+    :param content: The full text of the media item
+    :param collection_name: The ChromaDB collection that receives the chunk embeddings
+    :param media_id: The ID of the media item; chunk IDs are "<media_id>_chunk_<index>"
+    """
+    chunks = improved_chunking_process(content, chunk_options)
+    # Parallel lists: a chunk's text, embedding and ID all share one index
+    texts = []
+    for chunk in chunks:
+        texts.append(chunk['text'])
+    embeddings = list(map(create_embedding, texts))
+    ids = [f"{media_id}_chunk_{index}" for index, _ in enumerate(texts)]
+    # Vector store first, then the relational side
+    store_in_chroma(collection_name, texts, embeddings, ids)
+    # Mirror each chunk's text, start/end values and ID into SQLite
+    for chunk, chunk_id in zip(chunks, ids):
+        add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], chunk_id)
+    # Update the FTS table
+    update_fts_for_media(media_id)
+# Function to store documents and their embeddings in ChromaDB
+def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
+    """
+    Add documents and their precomputed embeddings to a ChromaDB collection, creating the collection if needed.
+    :param collection_name: The name of the target collection
+    :param texts: The document texts to store
+    :param embeddings: One embedding vector per text
+    :param ids: One ID per text
+    """
+    target = chroma_client.get_or_create_collection(name=collection_name)
+    target.add(ids=ids, documents=texts, embeddings=embeddings)
+# Function to perform vector search using ChromaDB
+def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
+    """
+    Embed the query and return the documents of its nearest neighbours in an existing collection.
+    :param collection_name: The name of the collection to search
+    :param query: The query text
+    :param k: The number of results requested from ChromaDB
+    :return: The documents returned for the query
+    """
+    query_vector = create_embedding(query)
+    matches = chroma_client.get_collection(name=collection_name).query(
+        n_results=k,
+        query_embeddings=[query_vector],
+    )
+    # Exactly one query embedding was sent, so only the first result row is relevant
+    top_documents = matches['documents'][0]
+    return top_documents
+# Loaded Hugging Face (tokenizer, model) pairs, keyed by model name, so that a model
+# is read from disk once per process instead of once per embedded text.
+_HF_MODEL_CACHE: Dict[str, Any] = {}
+def _load_hf_model(model_name: str):
+    """Return the (tokenizer, model) pair for model_name, loading it only on first use."""
+    if model_name not in _HF_MODEL_CACHE:
+        from transformers import AutoTokenizer, AutoModel
+        _HF_MODEL_CACHE[model_name] = (
+            AutoTokenizer.from_pretrained(model_name),
+            AutoModel.from_pretrained(model_name),
+        )
+    return _HF_MODEL_CACHE[model_name]
+def create_embedding(text: str) -> List[float]:
+    """
+    Create an embedding for the text with the provider configured in the [Embeddings] section.
+    :param text: The text to embed
+    :return: The embedding vector
+    :raises ValueError: If the configured provider is not 'openai', 'local' or 'huggingface'
+    :raises requests.HTTPError: If the 'local' endpoint answers with an HTTP error status
+    """
+    # The module-level settings are only read here, so no `global` declaration is needed
+    if embedding_provider == 'openai':
+        return get_openai_embeddings(text, embedding_model)
+    if embedding_provider == 'local':
+        response = requests.post(
+            embedding_api_url,
+            json={"text": text, "model": embedding_model},
+            headers={"Authorization": f"Bearer {embedding_api_key}"}
+        )
+        # Surface the HTTP failure itself rather than a KeyError/JSON error from an error body
+        response.raise_for_status()
+        return response.json()['embedding']
+    if embedding_provider == 'huggingface':
+        import torch
+        tokenizer, model = _load_hf_model(embedding_model)
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Use the mean of the last hidden state as the sentence embedding
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        return embeddings[0].tolist()  # Convert to list for consistency
+    raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
+    """
+    Create and store embeddings for every database item not yet present in the "all_content_embeddings" collection.
+    :param api_choice: "openai" to embed through OpenAI; any other value uses the Llama.cpp endpoint
+    :param model_or_url: The OpenAI model name or the Llama.cpp API URL, depending on api_choice
+    :return: A status message; errors are logged and reported in the message instead of being raised
+    """
+    try:
+        all_content = get_all_content_from_database()
+        if not all_content:
+            return "No content found in the database."
+        texts_to_embed = []
+        embeddings_to_store = []
+        ids_to_store = []
+        collection_name = "all_content_embeddings"
+        # Initialize or get the ChromaDB collection
+        collection = chroma_client.get_or_create_collection(name=collection_name)
+        for content_item in all_content:
+            media_id = content_item['id']
+            text = content_item['content']
+            doc_id = f"doc_{media_id}"
+            # collection.get() returns a result dict that is truthy even when nothing matched,
+            # so whether the embedding exists has to be read from its 'ids' list.
+            if collection.get(ids=[doc_id])['ids']:
+                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
+                continue  # Skip if embedding already exists
+            # Create the embedding
+            if api_choice == "openai":
+                embedding = create_openai_embedding(text, model_or_url)
+            else:  # Llama.cpp
+                embedding = create_llamacpp_embedding(text, model_or_url)
+            # Collect the text, embedding, and ID for batch storage
+            texts_to_embed.append(text)
+            embeddings_to_store.append(embedding)
+            ids_to_store.append(doc_id)
+        # Store all new embeddings in ChromaDB (the three lists always grow together)
+        if texts_to_embed:
+            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)
+        return "Embeddings created and stored successfully for all new content."
+    except Exception as e:
+        logging.error(f"Error during embedding creation: {str(e)}")
+        return f"Error: {str(e)}"
+def create_openai_embedding(text: str, model: str) -> List[float]:
+    """
+    Create an embedding for the text through the OpenAI embeddings helper.
+    :param text: The text to embed
+    :param model: The OpenAI embedding model name
+    :return: The value returned by get_openai_embeddings
+    """
+    # No API key is passed on from here, so none is looked up: reading config['API'] without
+    # using the value would only raise KeyError when that section is missing from the config.
+    return get_openai_embeddings(text, model)
+def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
+    """
+    Request an embedding for the text from a Llama.cpp HTTP endpoint.
+    :param text: The text to embed, sent as the "input" field of the JSON body
+    :param api_url: The URL the request is posted to
+    :return: The 'embedding' field of the JSON reply
+    :raises Exception: If the endpoint answers with any status other than 200
+    """
+    payload = {"input": text}
+    reply = requests.post(api_url, json=payload)
+    # Guard clause: anything but a plain 200 is reported with the reply body
+    if reply.status_code != 200:
+        raise Exception(f"Error from Llama.cpp API: {reply.text}")
+    return reply.json()['embedding']
+def get_all_content_from_database() -> List[Dict[str, Any]]:
+    """
+    Fetch every Media row that is not marked as trash and return the rows as plain dictionaries.
+    Returns:
+        List[Dict[str, Any]]: One dictionary per row with the keys 'id', 'content', 'title', 'author' and 'type'.
+    Raises:
+        DatabaseError: If SQLite reports an error while the rows are being read.
+    """
+    try:
+        from App_Function_Libraries.DB.DB_Manager import db
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT id, content, title, author, type
+                FROM Media
+                WHERE is_trash = 0  -- Exclude items marked as trash
+            """)
+            # Column positions follow the SELECT list above
+            records = []
+            for row in cursor.fetchall():
+                record = {}
+                record['id'] = row[0]
+                record['content'] = row[1]
+                record['title'] = row[2]
+                record['author'] = row[3]
+                record['type'] = row[4]
+                records.append(record)
+        return records
+    except sqlite3.Error as e:
+        logging.error(f"Error retrieving all content from database: {e}")
+        from App_Function_Libraries.DB.SQLite_DB import DatabaseError
+        raise DatabaseError(f"Error retrieving all content from database: {e}")
+def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
+    """
+    Add documents and embeddings to a ChromaDB collection, tagging each entry with its source.
+    :param collection_name: The name of the target collection (created if it does not exist)
+    :param texts: The document texts to store
+    :param embeddings: One embedding vector per text
+    :param ids: One ID per text
+    :param sources: One source per text, stored as {'source': <value>} metadata
+    """
+    target = chroma_client.get_or_create_collection(name=collection_name)
+    citation_metadata = []
+    for source in sources:
+        citation_metadata.append({'source': source})
+    target.add(ids=ids, documents=texts, embeddings=embeddings, metadatas=citation_metadata)
+def check_embedding_status(selected_item):
+    """
+    Report whether the "all_content_embeddings" collection holds an embedding for the selected item.
+    :param selected_item: The selection string; the text before the first '(' is used as the item ID
+    :return: A (status message, embedding preview) tuple; the preview is "" when there is nothing to show
+    """
+    if not selected_item:
+        return "Please select an item", ""
+    # NOTE(review): create_new_embedding() compares this same parsed text against item titles,
+    # while here it is used as the media ID in "doc_<id>" - confirm which one the string carries.
+    item_id = selected_item.split('(')[0].strip()
+    collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
+    # Embeddings are left out of get() results unless requested through `include`;
+    # without it the 'embeddings' field is None and indexing it raises TypeError.
+    result = collection.get(ids=[f"doc_{item_id}"], include=["embeddings"])
+    if not result['ids']:
+        return f"No embedding found for item: {item_id}", ""
+    embedding = result['embeddings'][0]
+    embedding_preview = str(embedding[:50])  # Convert first 50 elements to string
+    return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
+def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
+    """
+    Create an embedding for one selected item and store it in the "all_content_embeddings" collection.
+    :param selected_item: The selection string; the text before the first '(' is matched against item titles
+    :param api_choice: "OpenAI" to embed through OpenAI; any other value uses the Llama.cpp endpoint
+    :param openai_model: The OpenAI embedding model name, used when api_choice is "OpenAI"
+    :param llamacpp_url: The Llama.cpp API URL, used for every other api_choice
+    :return: A status message; embedding and storage errors are reported in the message, not raised
+    """
+    if not selected_item:
+        return "Please select an item"
+    item_id = selected_item.split('(')[0].strip()
+    items = get_all_content_from_database()
+    item = next((candidate for candidate in items if candidate['title'] == item_id), None)
+    if not item:
+        return f"Item not found: {item_id}"
+    try:
+        # Honour the caller's backend choice and its matching model/URL argument, instead of
+        # embedding with the configured default provider whatever was selected.
+        if api_choice == "OpenAI":
+            embedding = create_openai_embedding(item['content'], openai_model)
+        else:  # Llama.cpp
+            embedding = create_llamacpp_embedding(item['content'], llamacpp_url)
+        collection_name = "all_content_embeddings"
+        store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
+        return f"New embedding created and stored for item: {item_id}"
+    except Exception as e:
+        return f"Error creating embedding: {str(e)}"
+#
+# End of Functions for ChromaDB
 #######################################################################################################################