"""
Evaluate a custom chunking strategy and a custom embedding function against
the chunking_evaluation framework's general benchmark.
"""

import io
from typing import List

import chardet
from chromadb import Documents, EmbeddingFunction, Embeddings
from chunking_evaluation import BaseChunker, GeneralEvaluation, rigorous_document_search
from chunking_evaluation.evaluation_framework.base_evaluation import BaseEvaluation

from App_Function_Libraries.Chunk_Lib import improved_chunking_process
from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
from App_Function_Libraries.Utils.Utils import load_comprehensive_config


def detect_file_encoding(file_path):
    """Detect a file's character encoding with chardet."""
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    # Detect once and reuse the result rather than running chardet twice.
    return chardet.detect(raw_data)['encoding']
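

# chardet can return None for empty or highly ambiguous files. A defensive
# variant (a hypothetical helper, not part of the original script) might fall
# back to UTF-8 in that case:
def detect_file_encoding_or_utf8(file_path):
    with open(file_path, 'rb') as file:
        detected = chardet.detect(file.read())['encoding']
    # Assume UTF-8 when detection is inconclusive.
    return detected or 'utf-8'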


class CustomEmbeddingFunction(EmbeddingFunction):
    """Chroma-compatible embedding function that delegates to create_embeddings_batch."""

    def __call__(self, input: Documents) -> Embeddings:
        # Read the embedding settings from the application config on every call.
        config = load_comprehensive_config()
        embedding_provider = config.get('Embeddings', 'embedding_provider', fallback='openai')
        embedding_model = config.get('Embeddings', 'embedding_model', fallback='text-embedding-3-small')
        embedding_api_url = config.get('Embeddings', 'api_url', fallback='')

        return create_embeddings_batch(input, embedding_provider, embedding_model, embedding_api_url)
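

# Note: __call__ above reloads the config for every batch. If
# load_comprehensive_config is at all expensive, a variant that reads the
# settings once at construction time may be preferable. A minimal sketch,
# assuming the same config section and keys as above (this class is an
# illustration, not part of the original script):
class CachedConfigEmbeddingFunction(EmbeddingFunction):
    def __init__(self):
        config = load_comprehensive_config()
        self.provider = config.get('Embeddings', 'embedding_provider', fallback='openai')
        self.model = config.get('Embeddings', 'embedding_model', fallback='text-embedding-3-small')
        self.api_url = config.get('Embeddings', 'api_url', fallback='')

    def __call__(self, input: Documents) -> Embeddings:
        return create_embeddings_batch(input, self.provider, self.model, self.api_url)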


class CustomChunker(BaseChunker):
    """BaseChunker adapter around improved_chunking_process."""

    def __init__(self, chunk_options):
        self.chunk_options = chunk_options

    def split_text(self, text: str) -> List[str]:
        # improved_chunking_process returns chunk dicts; the evaluator only
        # needs the raw text of each chunk.
        chunks = improved_chunking_process(text, self.chunk_options)
        return [chunk['text'] for chunk in chunks]

    def read_file(self, file_path: str) -> str:
        # Detect the encoding first so non-UTF-8 corpora load without errors.
        encoding = detect_file_encoding(file_path)
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()


def utf8_file_reader(file_path):
    """Strict UTF-8 reader, for corpora whose encoding is already known."""
    with io.open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


class CustomEvaluation(BaseEvaluation):
    """Evaluation that reads each corpus through the splitter's own file reader."""

    def _get_chunks_and_metadata(self, splitter):
        documents = []
        metadatas = []
        for corpus_id in self.corpus_list:
            corpus_path = corpus_id
            if self.corpora_id_paths is not None:
                corpus_path = self.corpora_id_paths[corpus_id]

            corpus = splitter.read_file(corpus_path)
            current_documents = splitter.split_text(corpus)
            current_metadatas = []
            for document in current_documents:
                try:
                    # Locate each chunk in the source corpus to record its offsets.
                    _, start_index, end_index = rigorous_document_search(corpus, document)
                except Exception as e:
                    raise Exception(f"Error in finding {document} in {corpus_id}") from e
                current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
            documents.extend(current_documents)
            metadatas.extend(current_metadatas)
        return documents, metadatas
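

# CustomEvaluation is defined but not exercised below (GeneralEvaluation is
# used instead). Judging from the attributes referenced above, its constructor
# is expected to populate self.corpus_list and self.corpora_id_paths before
# .run() is called, the same way the stock evaluations do.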


chunk_options = {
    'method': 'words',
    'max_size': 400,
    'overlap': 200,
    'adaptive': False,
    'multi_level': False,
    'language': 'english'
}
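
# With max_size=400 and overlap=200, consecutive chunks share half of their
# words, so each word lands in roughly two chunks and the total indexed text
# is roughly doubled compared to non-overlapping chunking.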

custom_chunker = CustomChunker(chunk_options)
custom_ef = CustomEmbeddingFunction()

# GeneralEvaluation bundles the framework's general benchmark corpora and questions.
evaluation = GeneralEvaluation()


def smart_file_reader(file_path):
    """Encoding-detecting reader; mirrors CustomChunker.read_file for standalone use."""
    encoding = detect_file_encoding(file_path)
    with io.open(file_path, 'r', encoding=encoding) as file:
        return file.read()


# A single run exercises both the custom chunker and the custom embedding
# function, so the evaluation does not need to be invoked twice with the
# same arguments.
results = evaluation.run(custom_chunker, custom_ef)
print(f"Evaluation Results:\n\t{results}")