Spaces:

itachi-ai
/

Chat-Bot

Sleeping

App Files Files Community

Chat-Bot / vectorize.py

itachi-ai

update

80871cf verified 8 months ago

raw

history blame contribute delete

2.37 kB

	from langchain_mongodb import MongoDBAtlasVectorSearch
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import PyPDFLoader, TextLoader
	from embed_with_db import embeddings, config, client
	from tqdm import tqdm

	class VectorDataBase:
	    """Split a source into text chunks and store their embeddings in MongoDB.

	    The source is either a file loaded through ``PyPDFLoader`` / ``TextLoader``
	    or, for any other ``file_type``, the ``file_path`` string itself treated
	    as raw text.  Chunks are produced by a ``RecursiveCharacterTextSplitter``
	    configured with ``chunk_size=512`` and ``chunk_overlap=32``.
	    """

	    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
	        """Store the source description and resolve the target collection.

	        Args:
	            file_path: Path handed to the loader, or the raw text to chunk
	                when ``file_type`` is neither ``'pdf'`` nor ``'text'``.
	            db_collection: Collection name looked up inside the database
	                named by ``config['DB_NAME']``.
	            file_type: ``'pdf'`` or ``'text'`` (compared case-insensitively);
	                any other value selects the raw-text fallback.
	            page_start: Converted with ``int()`` and used by
	                ``embedding_with_loop`` as the start offset into the chunk
	                list.
	        """
	        self.file_path = file_path
	        self.file_type = file_type
	        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
	        self.db_collection = client[config['DB_NAME']][db_collection]
	        self.start_page = int(page_start)

	    def load_docs_split(self):
	        """Return the chunked documents for the configured source.

	        Returns:
	            Whatever the text splitter returns: ``split_documents`` applied
	            to the loader output for ``'pdf'`` / ``'text'``, otherwise
	            ``create_documents`` applied to ``[self.file_path]``.
	        """
	        # Normalize once; str() keeps non-string values (e.g. None) from raising.
	        kind = str(self.file_type).lower()
	        if kind == 'pdf':
	            loader = PyPDFLoader(self.file_path)
	        elif kind == 'text':
	            loader = TextLoader(self.file_path)
	        else:
	            # No loader for this type: file_path itself is chunked as text.
	            return self.text_splitter.create_documents([self.file_path])
	        return self.text_splitter.split_documents(loader.load())

	    def docs_embeddings(self):
	        """Embed and store every chunk with one ``from_documents`` call.

	        Returns:
	            The object returned by ``MongoDBAtlasVectorSearch.from_documents``
	            when there are chunks, otherwise the string ``'Some issues'``.
	        """
	        texts = self.load_docs_split()
	        if not texts:
	            print('documents is not embedded')
	            # NOTE(review): a string is the failure value here, while
	            # embedding_with_loop raises; kept because callers may test for it.
	            return 'Some issues'
	        docsearch = MongoDBAtlasVectorSearch.from_documents(
	            texts,
	            embeddings,
	            collection=self.db_collection,
	            index_name=config['VECTOR_SEARCH_INDEX'],
	        )
	        print('done!')
	        return docsearch

	    def add_collection_database(self, doc):
	        """Embed one chunk and insert it into the collection.

	        Args:
	            doc: Object exposing ``page_content`` and a ``metadata`` mapping.
	                Missing ``source`` / ``page`` metadata fall back to
	                ``'Unknown'`` and ``0``.
	        """
	        self.db_collection.insert_one(
	            {
	                'text': doc.page_content,
	                'embedding': embeddings.embed_query(doc.page_content),
	                'source': doc.metadata.get('source', 'Unknown'),
	                'page': doc.metadata.get('page', 0),
	            }
	        )

	    def embedding_with_loop(self):
	        """Embed and insert chunks one at a time, with a progress bar.

	        Chunks before index ``self.start_page`` are skipped.

	        Raises:
	            ValueError: If no chunks were produced.  ``ValueError`` is a
	                subclass of ``Exception``, so ``except Exception`` handlers
	                keep working.
	        """
	        docs = self.load_docs_split()
	        if not docs:
	            raise ValueError('Some issue with it')
	        # NOTE(review): start_page slices the chunk list, not PDF pages;
	        # presumably a resume offset - confirm against callers.  A negative
	        # value slices from the end of the list.
	        for doc in tqdm(docs[self.start_page:]):
	            self.add_collection_database(doc)
	        print('Done')