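"""Ingest documents from the web, Excel, CSV, PDF, or plain-text files into a
persistent Chroma vector store using LangChain loaders and OpenAI embeddings."""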
import bs4
import os
import sys
import uuid

from dotenv import load_dotenv
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    WebBaseLoader,
)
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def from_web(url):
    """Load a blog post, parsing only the post title, header, and body."""
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
        ),
    )
    docs = loader.load()
    return docs
def from_excel(file_address):
    """Load a single .xlsx file or every .xlsx file in a directory."""
    if file_address.endswith(".xlsx"):
        loader = UnstructuredExcelLoader(file_path=file_address)
        return loader.load()
    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".xlsx"):
            # Load each Excel file found in the directory.
            loader = UnstructuredExcelLoader(file_path=file_path)
            docs.extend(loader.load())
    return docs
def from_csv(file_address):
    docs = []
    # Load the CSV file.
    if file_address.endswith(".csv"):
        loader = CSVLoader(file_path=file_address, encoding="utf-8")
        docs = loader.load()
    return docs
def from_pdf(file_address):
    loader = PyPDFLoader(file_path=file_address)
    docs = loader.load()
    return docs
def from_text_files(file_address):
    """Load a single .txt file or every .txt file in a directory."""
    if os.path.isfile(file_address) and file_address.endswith(".txt"):
        return TextLoader(file_address).load()
    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".txt"):
            loader = TextLoader(file_path)
            docs.extend(loader.load())
    return docs
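# Each loader above returns a list of LangChain Document objects;
# retriever_from_docs below chunks, embeds, and stores them.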
def retriever_from_docs(docs):
    if not docs:
        print("No documents to process.")
        return

    # Split the documents into smaller, overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    print(f"Number of document chunks: {len(splits)}")

    # Create embeddings for the document chunks (OpenAI embeddings, 1536 dimensions).
    # Alternatives tried: sentence-transformers/all-MiniLM-L6-v2 (384), all-mpnet-base-v2 (768),
    # and bert-large-uncased (1024) via langchain_community.embeddings.HuggingFaceEmbeddings.
    embeddings = OpenAIEmbeddings()

    # Generate a unique ID for each document chunk.
    doc_ids = [str(uuid.uuid4()) for _ in splits]
    print(f"Number of IDs generated: {len(doc_ids)}")

    # Create or load the Chroma vector store.
    persist_directory = "../../chroma_db"
    if os.path.exists(persist_directory):
        # Load the existing vector store and append the new chunks.
        chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        chroma_store.add_documents(splits, ids=doc_ids)
    else:
        print(f"{persist_directory} does not exist; creating a new vector store.")
        Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=persist_directory,
            ids=doc_ids,
        )

    print("Embeddings are added to the vector store.")
def main():
    print(sys.argv)
    load_dotenv()

    if len(sys.argv) < 2:
        print("Usage: python <script>.py <url | .xlsx/.csv/.pdf/.txt file | directory>")
        return

    # Example local inputs used during development:
    #   ../../../db_28_2_text/db_28_2_text/
    #   ../../../db_28_2_excel/db_28_2_excel/
    #   ../../../International Job Dataset/allJobs.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Technology Skills.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Tools Used.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Alternate Titles.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Emerging Tasks.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Job Zone Reference.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Job Zones.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Occupation Data.xlsx
    #   ../../../db_28_2_excel/db_28_2_excel/Related Occupations.xlsx
    source = sys.argv[1]
    source_lower = source.lower()

    # Web URLs are loaded directly; everything else must exist on disk.
    if source_lower.startswith("http"):
        retriever_from_docs(from_web(source))
        return
    if not os.path.exists(source):
        print("File address does not exist.")
        return

    # Determine the input type and load the documents accordingly.
    if ".xls" in source_lower or "excel" in source_lower:
        retriever_from_docs(from_excel(source))
    elif ".csv" in source_lower:
        retriever_from_docs(from_csv(source))
    elif ".pdf" in source_lower:
        retriever_from_docs(from_pdf(source))
    elif ".txt" in source_lower or "text" in source_lower:
        retriever_from_docs(from_text_files(source))
    else:
        print("Unsupported file format.")
if __name__ == "__main__":
    main()
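
# Example invocations (script name is a placeholder; paths mirror the examples in main()):
#   python ingest.py "../../../International Job Dataset/allJobs.xlsx"
#   python ingest.py ../../../db_28_2_text/db_28_2_text/
#   python ingest.py https://example.com/some-blog-post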