Chat-Bot / vectorize.py
itachi-ai's picture
update
80871cf verified
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from embed_with_db import embeddings, config, client
from tqdm import tqdm
class VectorDataBase:
    """Split a document into chunks and store their embeddings in MongoDB.

    Two ingestion paths are offered:

    * ``docs_embeddings`` hands every chunk to
      ``MongoDBAtlasVectorSearch.from_documents`` in one call.
    * ``embedding_with_loop`` embeds and inserts the chunks one at a time,
      optionally skipping the first ``page_start`` chunks.
    """

    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        """Store the source description and open the target collection.

        Args:
            file_path: Path handed to the loader when ``file_type`` is
                ``'pdf'`` or ``'text'``. For any other ``file_type`` this
                value is itself treated as the text to split.
            db_collection: Name of the collection inside
                ``config['DB_NAME']`` that receives the chunks.
            file_type: ``'pdf'`` or ``'text'`` (case-insensitive); anything
                else selects the raw-text behaviour described above.
            page_start: Number of leading *chunks* (not PDF pages) that
                ``embedding_with_loop`` skips. Converted with ``int()``.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]
        self.start_page = int(page_start)

    def load_docs_split(self):
        """Load the source and return it split into chunks.

        Returns:
            The list produced by the text splitter: ``split_documents`` on
            the loaded documents for ``'pdf'``/``'text'`` sources, otherwise
            ``create_documents`` on ``file_path`` used as literal text.
        """
        # Normalise once instead of re-evaluating per comparison.
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            # Unknown type: ``file_path`` carries the text itself, not a path.
            return self.text_splitter.create_documents([self.file_path])
        return self.text_splitter.split_documents(loader.load())

    def docs_embeddings(self):
        """Embed every chunk in one call and return the vector store.

        ``start_page`` is not applied on this path.

        Returns:
            The ``MongoDBAtlasVectorSearch`` built from the chunks, or the
            string ``'Some issues'`` when no chunks were produced.
        """
        texts = self.load_docs_split()
        if not texts:
            print('documents is not embedded')
            return 'Some issues'
        docsearch = MongoDBAtlasVectorSearch.from_documents(
            texts,
            embeddings,
            collection=self.db_collection,
            index_name=config['VECTOR_SEARCH_INDEX'],
        )
        print('done!')
        return docsearch

    def add_collection_database(self, doc):
        """Embed a single chunk and insert it into the collection.

        Args:
            doc: A chunk exposing ``page_content`` and a ``metadata`` mapping.
                Missing ``source``/``page`` metadata fall back to
                ``'Unknown'`` and ``0``.
        """
        text = doc.page_content
        # Use the document-side embedding call so vectors written here match
        # the ones ``docs_embeddings`` stores via ``from_documents``; some
        # embedding models encode queries and documents differently.
        vector = embeddings.embed_documents([text])[0]
        self.db_collection.insert_one(
            {
                'text': text,
                'embedding': vector,
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0),
            }
        )

    def embedding_with_loop(self):
        """Embed and insert the chunks one by one, with a progress bar.

        The first ``start_page`` chunks are skipped, which allows resuming
        an interrupted run.

        Raises:
            ValueError: If the source produced no chunks at all.
        """
        docs = self.load_docs_split()
        if not docs:
            # ValueError is still an Exception, so broad handlers keep working.
            raise ValueError(
                'Some issue with it: no document chunks were produced '
                f'(file_type={self.file_type!r})'
            )
        for doc in tqdm(docs[self.start_page:]):
            self.add_collection_database(doc)
        print('Done')