File size: 4,871 Bytes
43cd37c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# eval_Chroma_Embeddings.py
# Description: This script is used to evaluate the embeddings and chunking process for the ChromaDB model.
#
# Imports
import io
from typing import List
#
# External Imports
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils import embedding_functions
from chunking_evaluation import BaseChunker, rigorous_document_search
from chunking_evaluation import BaseChunker, GeneralEvaluation
from chunking_evaluation.evaluation_framework.base_evaluation import BaseEvaluation

#
# Local Imports
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
from App_Function_Libraries.RAG.ChromaDB_Library import embedding_model, embedding_api_url
from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch, embedding_provider
from App_Function_Libraries.Utils.Utils import load_comprehensive_config
#
########################################################################################################################
#
# Functions:
import chardet
# FIXME


def detect_file_encoding(file_path):
    """Detect the text encoding of a file using chardet.

    Reads the whole file as bytes and runs chardet's detector exactly once.

    Args:
        file_path: Path to the file to probe.

    Returns:
        The detected encoding name as a str, or None when chardet cannot
        determine one.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    # Detect once: the original ran chardet.detect() twice on the same
    # data (once for a debug print, once for the return), doubling the
    # potentially expensive analysis.
    result = chardet.detect(raw_data)
    return result['encoding']


class CustomEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the project's batch embedder."""

    def __call__(self, input: Documents) -> Embeddings:
        # Re-read the config on every call so edits take effect without a
        # restart.
        config = load_comprehensive_config()
        provider = config.get('Embeddings', 'embedding_provider', fallback='openai')
        model = config.get('Embeddings', 'embedding_model', fallback='text-embedding-3-small')
        api_url = config.get('Embeddings', 'api_url', fallback='')

        # Delegate the actual embedding work to the shared batch helper.
        return create_embeddings_batch(input, provider, model, api_url)


class CustomChunker(BaseChunker):
    """Adapter that plugs the project's chunking pipeline into the
    chunking_evaluation framework."""

    def __init__(self, chunk_options):
        # Options dict is forwarded verbatim to improved_chunking_process.
        self.chunk_options = chunk_options

    def split_text(self, text: str) -> List[str]:
        """Split *text* and return just the chunk strings."""
        return [
            piece['text']
            for piece in improved_chunking_process(text, self.chunk_options)
        ]

    def read_file(self, file_path: str) -> str:
        """Read a file after auto-detecting its encoding."""
        detected = detect_file_encoding(file_path)
        with open(file_path, 'r', encoding=detected) as handle:
            return handle.read()

def utf8_file_reader(file_path):
    """Read *file_path* as UTF-8 text and return its full contents."""
    # io.open is an alias of the builtin open() on Python 3.
    with io.open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents


class CustomEvaluation(BaseEvaluation):
    """Evaluation subclass whose chunk/metadata extraction maps every
    produced chunk back to its character span in the source corpus."""

    def _get_chunks_and_metadata(self, splitter):
        """Split every corpus with *splitter* and locate each chunk.

        Args:
            splitter: Object exposing read_file(path) and split_text(text)
                (e.g. CustomChunker).

        Returns:
            A (documents, metadatas) pair of parallel lists; each metadata
            dict holds "start_index", "end_index" and "corpus_id".

        Raises:
            Exception: If a produced chunk cannot be located in its corpus.
        """
        documents = []
        metadatas = []
        for corpus_id in self.corpus_list:
            # corpus_id doubles as the file path unless an explicit
            # id -> path mapping was supplied.
            corpus_path = corpus_id
            if self.corpora_id_paths is not None:
                corpus_path = self.corpora_id_paths[corpus_id]

            corpus = splitter.read_file(corpus_path)

            current_documents = splitter.split_text(corpus)
            current_metadatas = []
            for document in current_documents:
                try:
                    # An unlocatable chunk surfaces here (e.g. a failed
                    # search result that does not unpack into 3 values).
                    _, start_index, end_index = rigorous_document_search(corpus, document)
                except Exception as exc:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit) and chained so the
                    # original failure is preserved in the traceback.
                    print(f"Error in finding {document} in {corpus_id}")
                    raise Exception(f"Error in finding {document} in {corpus_id}") from exc
                current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
            documents.extend(current_documents)
            metadatas.extend(current_metadatas)
        return documents, metadatas


# Instantiate the custom chunker.
# Options consumed by improved_chunking_process (via CustomChunker).
chunk_options = {
    'method': 'words',       # word-based chunking
    'max_size': 400,         # maximum words per chunk
    'overlap': 200,          # words shared between consecutive chunks
    'adaptive': False,
    'multi_level': False,
    'language': 'english'
}
custom_chunker = CustomChunker(chunk_options)

# Instantiate the custom embedding function.
custom_ef = CustomEmbeddingFunction()


# Evaluation harness shipped with chunking_evaluation (general-purpose
# corpora). A redundant second `import chardet` that sat here has been
# dropped — chardet is already imported above.
evaluation = GeneralEvaluation()

def smart_file_reader(file_path):
    """Read a text file using its auto-detected encoding."""
    detected = detect_file_encoding(file_path)
    with io.open(file_path, mode='r', encoding=detected) as handle:
        return handle.read()

# Set the custom file reader
# (left disabled; smart_file_reader would replace the framework's default)
#evaluation._file_reader = smart_file_reader


# Generate Embedding results
# NOTE(review): this call and the "chunking" call below are identical —
# same chunker, same embedding function — so both produce the same kind of
# result; presumably one was meant to vary an argument. Confirm intent.
embedding_results = evaluation.run(custom_chunker, custom_ef)
print(f"Embedding Results:\n\t{embedding_results}")

# Generate Chunking results
chunk_results = evaluation.run(custom_chunker, custom_ef)
print(f"Chunking Results:\n\t{chunk_results}")

#
# End of File
########################################################################################################################