# Source: Hugging Face Space "oceansweep/tldw" (status: Running; file size: 15,486 bytes)

import numpy as np
from typing import List, Tuple, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import math
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import openai
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import re
import psycopg2
from psycopg2.extras import execute_values
import sqlite3
import logging



########################################################################################################################################################################################################################################
#
# RAG Chunking
# To fully integrate this chunking system, you need to:
#
#   1. Create the UnvectorizedMediaChunks table in your SQLite database.
#   2. Modify your document ingestion process to call chunk_and_store_unvectorized.
#   3. Run a background process that periodically calls RAGSystem.vectorize_all_documents
#      to process the unvectorized chunks.

# This chunking is pretty weak and needs improvement
# See notes for improvements #FIXME
import json
from typing import List, Dict, Any
from datetime import datetime


def chunk_and_store_unvectorized(
        db_connection,
        media_id: int,
        text: str,
        chunk_size: int = 1000,
        overlap: int = 100,
        chunk_type: str = 'fixed-length'
) -> List[int]:
    """Split ``text`` into word-window chunks and persist them as unvectorized rows.

    Args:
        db_connection: DB-API connection that holds the UnvectorizedMediaChunks table.
        media_id: Identifier of the media item the text belongs to.
        text: Full text to split.
        chunk_size: Number of words per chunk.
        overlap: Number of words shared by consecutive chunks.
        chunk_type: Label stored alongside every chunk row.

    Returns:
        The row ids of the inserted chunks, in chunk order.
    """
    pieces = create_chunks(text, chunk_size=chunk_size, overlap=overlap)
    stored_ids = store_unvectorized_chunks(db_connection, media_id, pieces, chunk_type)
    return stored_ids


def create_chunks(text: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
    """Split ``text`` into overlapping windows of whitespace-separated words.

    A new window starts every ``chunk_size - overlap`` words and holds up to
    ``chunk_size`` words. The chunk text is the window's words joined by single
    spaces, while ``start_char``/``end_char`` are the real offsets of the
    window in ``text`` (so ``text[start_char:end_char]`` is the source span,
    with its original whitespace).

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with the keys ``text``, ``start_char``, ``end_char``
        and ``index`` (the chunk's position in the list). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        # A non-positive stride would either crash range() or skip everything.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Record where every word really sits in the source text. Looking words up
    # with text.index() would always return the FIRST occurrence and therefore
    # report wrong offsets for repeated words.
    word_spans = [match.span() for match in re.finditer(r'\S+', text)]

    stride = chunk_size - overlap
    chunks = []
    for first_word in range(0, len(word_spans), stride):
        window = word_spans[first_word:first_word + chunk_size]
        chunks.append({
            'text': ' '.join(text[begin:end] for begin, end in window),
            'start_char': window[0][0],
            'end_char': window[-1][1],
            'index': len(chunks)
        })
    return chunks


def store_unvectorized_chunks(
        db_connection,
        media_id: int,
        chunks: List[Dict[str, Any]],
        chunk_type: str
) -> List[int]:
    """Insert ``chunks`` into UnvectorizedMediaChunks and return the new row ids.

    Args:
        db_connection: DB-API connection using ``?`` placeholders.
        media_id: Identifier of the media item the chunks belong to.
        chunks: Dicts with the keys ``text``, ``index``, ``start_char`` and
            ``end_char``.
        chunk_type: Label stored in the ``chunk_type`` column of every row.

    Returns:
        ``cursor.lastrowid`` of each insert, in the order of ``chunks``.
        All inserts are committed together after the last one.
    """
    inserted_ids = []
    cursor = db_connection.cursor()
    for piece in chunks:
        # Example metadata: only the length of the chunk text is recorded.
        metadata_json = json.dumps({'length': len(piece['text'])})
        row_values = (
            media_id,
            piece['text'],
            piece['index'],
            piece['start_char'],
            piece['end_char'],
            chunk_type,
            metadata_json,
        )
        cursor.execute("""

            INSERT INTO UnvectorizedMediaChunks 

            (media_id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata)

            VALUES (?, ?, ?, ?, ?, ?, ?)

        """, row_values)
        inserted_ids.append(cursor.lastrowid)
    db_connection.commit()
    return inserted_ids


def get_unvectorized_chunks(
        db_connection,
        media_id: int,
        limit: int = 100,
        offset: int = 0
) -> List[Dict[str, Any]]:
    """Fetch one page of not-yet-processed chunks for a media item.

    Args:
        db_connection: DB-API connection using ``?`` placeholders.
        media_id: Media item whose chunks are requested.
        limit: Maximum number of rows to return.
        offset: Number of matching rows to skip.

    Returns:
        Chunk dicts ordered by ``chunk_index`` with the keys ``id``, ``text``,
        ``index``, ``start_char``, ``end_char``, ``type`` and ``metadata``
        (the stored JSON decoded with ``json.loads``).
    """
    cursor = db_connection.cursor()
    cursor.execute("""

        SELECT id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata

        FROM UnvectorizedMediaChunks

        WHERE media_id = ? AND is_processed = FALSE

        ORDER BY chunk_index

        LIMIT ? OFFSET ?

    """, (media_id, limit, offset))

    pending = []
    for row_id, chunk_text, chunk_index, start_char, end_char, chunk_type, raw_metadata in cursor.fetchall():
        pending.append({
            'id': row_id,
            'text': chunk_text,
            'index': chunk_index,
            'start_char': start_char,
            'end_char': end_char,
            'type': chunk_type,
            'metadata': json.loads(raw_metadata),
        })
    return pending


def mark_chunks_as_processed(db_connection, chunk_ids: List[int]):
    """Flag the given chunk rows as processed and stamp their modification time.

    Args:
        db_connection: DB-API connection using ``?`` placeholders.
        chunk_ids: Ids of the UnvectorizedMediaChunks rows to update. An empty
            list is a no-op apart from the commit.

    The whole batch receives the same ``last_modified`` value.
    """
    # Take one timestamp for the batch and bind it as ISO-8601 text. Binding a
    # raw datetime relies on sqlite3's implicit default adapter, which is
    # deprecated since Python 3.12; this is the same text that adapter produced.
    processed_at = datetime.now().isoformat(sep=' ')

    cursor = db_connection.cursor()
    cursor.executemany("""

        UPDATE UnvectorizedMediaChunks

        SET is_processed = TRUE, last_modified = ?

        WHERE id = ?

    """, [(processed_at, chunk_id) for chunk_id in chunk_ids])
    db_connection.commit()


# Usage example
def process_media_chunks(db_connection, media_id: int, text: str):
    """Usage example: chunk and store ``text``, then mark its chunks as processed.

    The actual processing step (e.g. vectorization) is left as a placeholder
    between fetching the unprocessed chunks and marking them as done.
    """
    chunk_ids = chunk_and_store_unvectorized(db_connection, media_id, text)
    print(f"Stored {len(chunk_ids)} unvectorized chunks for media_id {media_id}")

    # Later, when the chunks are due for processing, load the pending ones.
    pending = get_unvectorized_chunks(db_connection, media_id)

    # Process chunks here (e.g., vectorize them)
    # ...

    # Once processed, flag exactly the rows that were loaded above.
    handled_ids = [entry['id'] for entry in pending]
    mark_chunks_as_processed(db_connection, handled_ids)
###########################################################################################################################################################################################################
#
# RAG System

# To use this updated RAG system in your existing application:
#
# Install required packages:
# pip install sentence-transformers psycopg2-binary scikit-learn transformers torch
# Set up PostgreSQL with pgvector:
#
# Install PostgreSQL and the pgvector extension.
# Create a new database for vector storage.
#
# Update your main application to use the RAG system:
#
# Import the RAGSystem class from this new file.
# Initialize the RAG system with your SQLite and PostgreSQL configurations.
# Use the vectorize_all_documents method to initially vectorize your existing documents.
#
#
# Modify your existing PDF_Ingestion_Lib.py and Book_Ingestion_Lib.py:
#
# After successfully ingesting a document into SQLite, call the vectorization method from the RAG system.

# Example modification for ingest_text_file in Book_Ingestion_Lib.py:
# from RAG_Library import RAGSystem
#
# # Initialize RAG system (do this once in your main application)
# rag_system = RAGSystem(sqlite_path, pg_config)
#
# def ingest_text_file(file_path, title=None, author=None, keywords=None):
#     try:
#         # ... (existing code)
#
#         # Add the text file to the database
#         doc_id = add_media_with_keywords(
#             url=file_path,
#             title=title,
#             media_type='document',
#             content=content,
#             keywords=keywords,
#             prompt='No prompt for text files',
#             summary='No summary for text files',
#             transcription_model='None',
#             author=author,
#             ingestion_date=datetime.now().strftime('%Y-%m-%d')
#         )
#
#         # Vectorize the newly added document
#         rag_system.vectorize_document(doc_id, content)
#
#         return f"Text file '{title}' by {author} ingested and vectorized successfully."
#     except Exception as e:
#         logging.error(f"Error ingesting text file: {str(e)}")
#         return f"Error ingesting text file: {str(e)}"



# Setup logging
# NOTE: basicConfig runs at import time, so importing this module configures
# the root logger (INFO level, timestamped format) if it has no handlers yet.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used for error reporting in this file.
logger = logging.getLogger(__name__)

# Constants
# Name of the sentence-transformers model loaded by RAGSystem.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
VECTOR_DIM = 384  # Dimension of the chosen embedding model; keep in sync with the pgvector column width of document_vectors


class RAGSystem:
    """Retrieval helper combining a SQLite media store with a pgvector index.

    Document text is read from the ``media`` table of the SQLite database at
    ``sqlite_path``. Chunk embeddings are kept in the ``document_vectors``
    table of the PostgreSQL database described by ``pg_config``; the pgvector
    extension must already be installed in that database.
    """

    def __init__(self, sqlite_path: str, pg_config: Dict[str, str], cache_size: int = 100):
        """Load the embedding model and make sure the vector table exists.

        Args:
            sqlite_path: Path of the SQLite database holding the ``media`` and
                ``UnvectorizedMediaChunks`` tables.
            pg_config: Keyword arguments forwarded to ``psycopg2.connect``.
            cache_size: Maximum number of text embeddings memoized by this
                instance.
        """
        self.sqlite_path = sqlite_path
        self.pg_config = pg_config
        self.model = SentenceTransformer(EMBEDDING_MODEL)
        self.cache_size = cache_size

        # Per-instance memo. A class-level @lru_cache on the method would be
        # shared by all instances, keep them alive and ignore cache_size.
        self._embedding_cache = lru_cache(maxsize=cache_size)(self._encode_text)

        # Heavy auxiliary models are loaded on first use and then reused.
        self._query_expander = None
        self._cross_encoder = None

        self._init_postgres()

    def _init_postgres(self):
        """Create the ``document_vectors`` table, or upgrade an older layout.

        The table stores one row per ``(document_id, chunk_index)``; that pair
        is made unique so that ``vectorize_document`` can upsert on it.
        """
        conn = psycopg2.connect(**self.pg_config)
        try:
            # `with conn` commits on success and rolls back on error, but it
            # does NOT close the connection - hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    cur.execute(f"""
                    CREATE TABLE IF NOT EXISTS document_vectors (
                        id SERIAL PRIMARY KEY,
                        document_id INTEGER NOT NULL,
                        chunk_index INTEGER NOT NULL DEFAULT 0,
                        vector vector({int(VECTOR_DIM)}),
                        metadata JSONB
                    )
                    """)
                    # Bring a table created with the earlier layout
                    # (id, document_id UNIQUE, vector) up to date.
                    cur.execute(
                        "ALTER TABLE document_vectors "
                        "ADD COLUMN IF NOT EXISTS chunk_index INTEGER NOT NULL DEFAULT 0"
                    )
                    cur.execute(
                        "ALTER TABLE document_vectors "
                        "ADD COLUMN IF NOT EXISTS metadata JSONB"
                    )
                    # The old single-column constraint (PostgreSQL's default
                    # name) would forbid more than one chunk per document.
                    cur.execute(
                        "ALTER TABLE document_vectors "
                        "DROP CONSTRAINT IF EXISTS document_vectors_document_id_key"
                    )
                    # Arbiter index required by ON CONFLICT (document_id, chunk_index).
                    cur.execute(
                        "CREATE UNIQUE INDEX IF NOT EXISTS document_vectors_doc_chunk_idx "
                        "ON document_vectors (document_id, chunk_index)"
                    )
        finally:
            conn.close()

    def _encode_text(self, text: str) -> np.ndarray:
        """Embed ``text`` with the sentence-transformer model (uncached)."""
        return self.model.encode([text])[0]

    def _get_embedding(self, text: str) -> np.ndarray:
        """Return the embedding of ``text``, memoized per instance.

        The returned array is shared with the cache; treat it as read-only.
        """
        return self._embedding_cache(text)

    def _store_chunk_vectors(self, doc_id: int, chunks: List[Dict[str, Any]]):
        """Embed ``chunks`` and upsert them for ``doc_id`` in one transaction.

        Each chunk dict must provide ``text`` and ``index``; the whole dict is
        stored as JSON in the ``metadata`` column.
        """
        if not chunks:
            return

        rows = [
            (doc_id, chunk['index'], self._get_embedding(chunk['text']).tolist(), json.dumps(chunk))
            for chunk in chunks
        ]

        # One connection for the whole document instead of one per chunk.
        conn = psycopg2.connect(**self.pg_config)
        try:
            with conn:
                with conn.cursor() as cur:
                    # psycopg2 sends a Python list as an SQL array, so cast it
                    # to pgvector's type explicitly.
                    cur.executemany("""
                    INSERT INTO document_vectors (document_id, chunk_index, vector, metadata)
                    VALUES (%s, %s, %s::vector, %s)
                    ON CONFLICT (document_id, chunk_index)
                    DO UPDATE SET vector = EXCLUDED.vector, metadata = EXCLUDED.metadata
                    """, rows)
        finally:
            conn.close()

    def vectorize_document(self, doc_id: int, content: str):
        """Chunk ``content`` (1000 words, 100 overlap) and index it under ``doc_id``."""
        chunks = create_chunks(content, chunk_size=1000, overlap=100)
        self._store_chunk_vectors(doc_id, chunks)

    def vectorize_all_documents(self):
        """Vectorize every unprocessed chunk stored in SQLite.

        Chunks are read per media item in batches of 1000, indexed under their
        ``media_id`` and their stored chunk index, and then marked as
        processed so the next batch query no longer returns them.
        """
        sqlite_conn = sqlite3.connect(self.sqlite_path)
        try:
            cur = sqlite_conn.cursor()
            cur.execute(
                "SELECT DISTINCT media_id FROM UnvectorizedMediaChunks WHERE is_processed = FALSE"
            )
            media_ids = [row[0] for row in cur.fetchall()]

            for media_id in media_ids:
                while True:
                    batch = get_unvectorized_chunks(sqlite_conn, media_id, limit=1000)
                    if not batch:
                        break
                    self._store_chunk_vectors(media_id, batch)
                    mark_chunks_as_processed(sqlite_conn, [chunk['id'] for chunk in batch])
        finally:
            # sqlite3's connection context manager only ends the transaction.
            sqlite_conn.close()

    def semantic_search(self, query: str, top_k: int = 5) -> List[Tuple[int, int, float]]:
        """Return the ``top_k`` chunks closest to ``query`` by cosine distance.

        Returns:
            ``(document_id, chunk_index, similarity)`` tuples, most similar
            first, where ``similarity`` is ``1 - cosine distance``.
        """
        query_vector = self._get_embedding(query).tolist()

        conn = psycopg2.connect(**self.pg_config)
        try:
            with conn:
                with conn.cursor() as cur:
                    # `<=>` is pgvector's cosine-distance operator; `<->` is L2
                    # distance, for which "1 - distance" is not a similarity.
                    cur.execute("""
                    SELECT document_id, chunk_index, 1 - (vector <=> %s::vector) AS similarity
                    FROM document_vectors
                    ORDER BY vector <=> %s::vector ASC
                    LIMIT %s
                    """, (query_vector, query_vector, top_k))
                    return cur.fetchall()
        finally:
            conn.close()

    def get_document_content(self, doc_id: int) -> str:
        """Return the ``content`` of media row ``doc_id``, or ``""`` if absent or NULL."""
        conn = sqlite3.connect(self.sqlite_path)
        try:
            cur = conn.cursor()
            cur.execute("SELECT content FROM media WHERE id = ?", (doc_id,))
            result = cur.fetchone()
        finally:
            conn.close()
        if result is None or result[0] is None:
            return ""
        return result[0]

    def bm25_search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
        """Rank all media rows against ``query`` with a BM25-style score.

        The score applies BM25's saturation and length normalization
        (k1=1.5, b=0.75) to TF-IDF weights; document "length" is the sum of a
        document's TF-IDF weights.

        Returns:
            Up to ``top_k`` ``(media_id, score)`` pairs, best first. Empty when
            there are no documents or no indexable terms.
        """
        conn = sqlite3.connect(self.sqlite_path)
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, content FROM media")
            documents = cur.fetchall()
        finally:
            conn.close()

        if not documents:
            return []

        vectorizer = TfidfVectorizer(use_idf=True)
        try:
            # NULL content is treated as empty text.
            tfidf_matrix = vectorizer.fit_transform([content or "" for _, content in documents])
        except ValueError:
            # Raised when no document contains an indexable term.
            return []

        query_vector = vectorizer.transform([query])
        doc_lengths = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
        avg_doc_length = doc_lengths.mean()
        if avg_doc_length == 0:
            return []

        k1, b = 1.5, 0.75
        scores = []
        for row, (doc_id, _) in enumerate(documents):
            # Sparse matrices have no `.A1`; densify the element-wise product.
            overlap = query_vector.multiply(tfidf_matrix[row]).toarray().ravel()
            length_norm = k1 * (1 - b + b * doc_lengths[row] / avg_doc_length)
            score = float(np.sum((k1 + 1) * overlap / (length_norm + overlap)))
            scores.append((doc_id, score))

        return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

    def combine_search_results(self, bm25_results: List[Tuple[int, float]], vector_results: List[Tuple[int, float]],
                               alpha: float = 0.5) -> List[Tuple[int, float]]:
        """Blend keyword and vector scores into one ranking.

        Each BM25 score is weighted by ``alpha`` and each vector score by
        ``1 - alpha``; a document present in both lists gets the sum.

        Returns:
            ``(document_id, combined_score)`` pairs, best first.
        """
        combined_scores: Dict[int, float] = {}
        # Weight by the list a score came from, not by whether the id also
        # happens to appear in the BM25 list.
        for doc_id, score in bm25_results:
            combined_scores[doc_id] = combined_scores.get(doc_id, 0.0) + alpha * score
        for doc_id, score in vector_results:
            combined_scores[doc_id] = combined_scores.get(doc_id, 0.0) + (1 - alpha) * score
        return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    @staticmethod
    def _best_chunk_per_document(chunk_hits: List[Tuple[int, int, float]]) -> List[Tuple[int, float]]:
        """Reduce ``(document_id, chunk_index, similarity)`` hits to one best score per document."""
        best: Dict[int, float] = {}
        for document_id, _chunk_index, similarity in chunk_hits:
            similarity = float(similarity)
            if document_id not in best or similarity > best[document_id]:
                best[document_id] = similarity
        return sorted(best.items(), key=lambda x: x[1], reverse=True)

    def expand_query(self, query: str) -> str:
        """Append text generated by ``t5-small`` for an "expand query" prompt to ``query``."""
        if self._query_expander is None:
            # Load once; reloading on every call repeats model initialisation.
            self._query_expander = (
                T5Tokenizer.from_pretrained("t5-small"),
                T5ForConditionalGeneration.from_pretrained("t5-small"),
            )
        tokenizer, model = self._query_expander

        input_text = f"expand query: {query}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")

        outputs = model.generate(input_ids, max_length=50, num_return_sequences=1)
        expanded_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return f"{query} {expanded_query}"

    def cross_encoder_rerank(self, query: str, initial_results: List[Tuple[int, float]], top_k: int = 5) -> List[
        Tuple[int, float]]:
        """Re-score the first ``2 * top_k`` results with a cross-encoder.

        Returns:
            Up to ``top_k`` ``(document_id, cross_encoder_score)`` pairs, best
            first; an empty list when there is nothing to rerank.
        """
        candidates = initial_results[:top_k * 2]
        if not candidates:
            return []

        if self._cross_encoder is None:
            from sentence_transformers import CrossEncoder
            self._cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        pairs = [[query, self.get_document_content(doc_id)] for doc_id, _ in candidates]
        scores = self._cross_encoder.predict(pairs)

        reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
        return [(doc_id, float(score)) for (doc_id, _), score in reranked[:top_k]]

    def rag_query(self, query: str, search_type: str = 'combined', top_k: int = 5, use_hyde: bool = False,
                  rerank: bool = False, expand: bool = False) -> List[Dict[str, Any]]:
        """Run a retrieval query and return enriched results.

        Args:
            query: User query.
            search_type: ``'vector'``, ``'bm25'`` or ``'combined'``.
            top_k: Maximum number of documents to return.
            use_hyde: Reserved; HyDE is not implemented, so the regular search
                selected by ``search_type`` is used and a warning is logged.
            rerank: Re-score the candidates with the cross-encoder.
            expand: Expand the query with ``expand_query`` first.

        Returns:
            Up to ``top_k`` dicts with ``document_id``, ``score`` and the first
            500 characters of ``content``. Any error (including an invalid
            ``search_type``) is logged and yields an empty list.
        """
        try:
            if expand:
                query = self.expand_query(query)

            if use_hyde:
                # Previously this branch skipped the search entirely and left
                # `results` unbound.
                logger.warning("HyDE is not implemented; using '%s' search instead.", search_type)

            if search_type == 'vector':
                # semantic_search yields chunk-level triples; collapse them to
                # the (document_id, score) pairs used downstream.
                results = self._best_chunk_per_document(self.semantic_search(query, top_k))
            elif search_type == 'bm25':
                results = self.bm25_search(query, top_k)
            elif search_type == 'combined':
                bm25_results = self.bm25_search(query, top_k)
                vector_results = self._best_chunk_per_document(self.semantic_search(query, top_k))
                results = self.combine_search_results(bm25_results, vector_results)
            else:
                raise ValueError("Invalid search type. Choose 'vector', 'bm25', or 'combined'.")

            if rerank:
                results = self.cross_encoder_rerank(query, results, top_k)

            enriched_results = []
            for doc_id, score in results[:top_k]:
                content = self.get_document_content(doc_id)
                enriched_results.append({
                    "document_id": doc_id,
                    "score": score,
                    "content": content[:500]  # Truncate content for brevity
                })

            return enriched_results
        except Exception as e:
            # Top-level boundary: keep the "empty list on failure" contract but
            # record the traceback.
            logger.exception("An error occurred during RAG query: %s", e)
            return []


# Example usage: build the index, run one query and print the hits.
if __name__ == "__main__":
    # Placeholder connection settings - replace with real values before running.
    pg_config = {
        "dbname": "your_db_name",
        "user": "your_username",
        "password": "your_password",
        "host": "localhost"
    }
    sqlite_path = "path/to/your/sqlite/database.db"

    rag_system = RAGSystem(sqlite_path, pg_config)

    # Vectorize all documents (run this once or periodically)
    rag_system.vectorize_all_documents()

    # Example query: combined search with query expansion and reranking.
    query = "programming concepts for beginners"
    results = rag_system.rag_query(query, search_type='combined', expand=True, rerank=True)

    print(f"Search results for query: '{query}'\n")
    for i, result in enumerate(results, 1):
        # One print call per hit; `sep` emits the same five lines as before.
        print(
            f"Result {i}:",
            f"Document ID: {result['document_id']}",
            f"Score: {result['score']:.4f}",
            f"Content snippet: {result['content']}",
            "---",
            sep="\n",
        )