from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from tqdm import tqdm

from embed_with_db import embeddings, config, client


class VectorDataBase:
    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        self.file_path = file_path
        self.file_type = file_type
        # Split documents into 512-character chunks with a 32-character overlap.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]
        self.start_page = int(page_start)

    def load_docs_split(self):
        """Load the source file and split it into chunks."""
        if str(self.file_type).lower() == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif str(self.file_type).lower() == 'text':
            loader = TextLoader(self.file_path)
        else:
            loader = None

        if loader:
            docs = loader.load()
            return self.text_splitter.split_documents(docs)
        # No loader matched: treat file_path as raw text and split it directly.
        return self.text_splitter.create_documents([self.file_path])

    def docs_embeddings(self):
        """Embed all chunks and store them in Atlas Vector Search in a single call."""
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'],
            )
            print('done!')
            return docsearch
        print('documents were not embedded')
        return 'Some issues'

    def add_collection_database(self, doc):
        """Embed a single chunk and insert it into the MongoDB collection."""
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0),
            }
        )

    def embedding_with_loop(self):
        """Embed and insert chunks one at a time with a progress bar."""
        docs = self.load_docs_split()
        if docs:
            # Skip the first `start_page` chunks (useful for resuming an interrupted run).
            for doc in tqdm(docs[self.start_page:]):
                self.add_collection_database(doc)
            print('Done')
        else:
            raise Exception('No documents were loaded for embedding')
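

# A minimal usage sketch, assuming the `embeddings`, `config`, and `client` objects
# from embed_with_db are already configured. 'data/sample.pdf' and 'my_docs' are
# hypothetical placeholders, not values from the original code.
if __name__ == '__main__':
    vector_db = VectorDataBase('data/sample.pdf', 'my_docs', file_type='pdf')
    # Inserts chunks one by one with a progress bar; call docs_embeddings()
    # instead to store everything in a single MongoDBAtlasVectorSearch call.
    vector_db.embedding_with_loop()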