|
from langchain_mongodb import MongoDBAtlasVectorSearch |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader |
|
from embed_with_db import embeddings, config, client |
|
from tqdm import tqdm |
|
|
|
class VectorDataBase:
    """Split a document into chunks and store their embeddings in MongoDB.

    The class relies on the module-level ``client``, ``config`` and
    ``embeddings`` objects imported from ``embed_with_db``.

    Two ingestion paths are offered:

    * ``docs_embeddings`` hands all chunks to
      ``MongoDBAtlasVectorSearch.from_documents`` in one call.
    * ``embedding_with_loop`` embeds and inserts the chunks one at a time,
      optionally skipping the first ``page_start`` chunks.
    """

    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        """Configure the splitter and resolve the target collection.

        Args:
            file_path: Path handed to the loader. When ``file_type`` is
                neither ``'pdf'`` nor ``'text'`` this value is itself split
                as literal text (see ``load_docs_split``).
            db_collection: Name of the collection inside the database named
                by ``config['DB_NAME']``.
            file_type: ``'pdf'`` or ``'text'`` (compared case-insensitively).
            page_start: Offset applied by ``embedding_with_loop``. Despite
                the name it indexes the list of split chunks, not the pages
                of the source file. Converted with ``int()``.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=32,
        )
        self.db_collection = client[config['DB_NAME']][db_collection]
        self.start_page = int(page_start)

    def load_docs_split(self):
        """Load the configured input and return it as split chunks.

        Returns:
            The result of the text splitter: ``split_documents`` over the
            loaded file for ``'pdf'`` / ``'text'`` inputs, otherwise
            ``create_documents`` over the ``file_path`` value itself.
        """
        # Normalise once so both comparisons see the same value.
        kind = str(self.file_type).lower()

        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            # No loader for this type: the string held in ``file_path`` is
            # chunked as the text to embed, not opened as a file.
            return self.text_splitter.create_documents([self.file_path])

        return self.text_splitter.split_documents(loader.load())

    def docs_embeddings(self):
        """Embed every chunk in one ``from_documents`` call.

        Returns:
            The object returned by ``MongoDBAtlasVectorSearch.from_documents``
            when at least one chunk was produced; otherwise the string
            ``'Some issues'`` (kept for backward compatibility with callers
            that check the return value).
        """
        texts = self.load_docs_split()
        if not texts:
            print('documents is not embedded')
            return 'Some issues'

        docsearch = MongoDBAtlasVectorSearch.from_documents(
            texts,
            embeddings,
            collection=self.db_collection,
            index_name=config['VECTOR_SEARCH_INDEX'],
        )
        print('done!')
        return docsearch

    def add_collection_database(self, doc):
        """Embed a single chunk and insert it into the collection.

        Args:
            doc: Object exposing ``page_content`` and a ``metadata`` mapping.
                Missing ``source`` / ``page`` metadata default to
                ``'Unknown'`` and ``0`` respectively.
        """
        content = doc.page_content
        record = {
            'text': content,
            'embedding': embeddings.embed_query(content),
            'source': doc.metadata.get('source', 'Unknown'),
            'page': doc.metadata.get('page', 0),
        }
        self.db_collection.insert_one(record)

    def embedding_with_loop(self):
        """Embed and insert chunks one by one, showing a progress bar.

        The first ``self.start_page`` chunks are skipped via list slicing,
        so a negative value selects chunks counted from the end.

        Raises:
            ValueError: If the input produced no chunks at all.
        """
        docs = self.load_docs_split()
        if not docs:
            # ValueError is still an Exception, so existing broad handlers
            # keep working while the message now identifies the input.
            raise ValueError(
                f'No documents could be loaded from {self.file_path!r}'
            )

        for doc in tqdm(docs[self.start_page:]):
            self.add_collection_database(doc)
        print('Done')
|
|
|
|