|
import bs4 |
|
import pandas as pd |
|
from langchain import hub |
|
from langchain_community.document_loaders import WebBaseLoader |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_core.runnables import RunnablePassthrough |
|
from langchain_openai import ChatOpenAI, OpenAIEmbeddings |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
|
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader |
|
from langchain_community.document_loaders import UnstructuredExcelLoader |
|
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from dotenv import load_dotenv |
|
import sys |
|
import shutil |
|
import os |
|
import uuid |
|
import csv |
|
|
|
def from_web(url):
    """Load a web page into documents, keeping only its post markup.

    The page is fetched with ``WebBaseLoader`` and parsed through a
    ``bs4.SoupStrainer`` limited to elements whose CSS class is
    ``post-content``, ``post-title`` or ``post-header``.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``WebBaseLoader.load()`` for that single page.
    """
    post_sections = bs4.SoupStrainer(
        class_=("post-content", "post-title", "post-header")
    )
    page_loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs={"parse_only": post_sections},
    )
    return page_loader.load()
|
|
|
def from_excel(file_address):
    """Load Excel workbooks from a single file or from a directory.

    Args:
        file_address: Either the path of one workbook, or the path of a
            directory whose ``.xlsx`` files should all be loaded
            (sub-directories are not searched). The ``.xlsx`` suffix is
            matched case-insensitively.

    Returns:
        A list with the documents produced by ``UnstructuredExcelLoader`` for
        every workbook found. The list is empty when a directory contains no
        ``.xlsx`` files.

    Raises:
        OSError: If ``file_address`` is neither a file nor a listable
            directory.
    """
    # A path that names a workbook (or any existing regular file passed
    # explicitly, e.g. a legacy ``.xls``) is loaded directly instead of being
    # handed to os.listdir(), which would raise NotADirectoryError.
    names_workbook = file_address.lower().endswith(".xlsx")
    if names_workbook or os.path.isfile(file_address):
        return UnstructuredExcelLoader(file_path=file_address).load()

    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.lower().endswith(".xlsx"):
            # Load the individual workbook. Passing the directory itself here
            # (the previous behaviour) never loaded any file in the folder.
            loader = UnstructuredExcelLoader(file_path=file_path)
            docs.extend(loader.load())
    return docs
|
|
|
def from_csv(file_address):
    """Load one CSV file as documents.

    Args:
        file_address: Path expected to end in ``.csv`` (case-sensitive).

    Returns:
        The documents read by ``CSVLoader`` with UTF-8 encoding, or an empty
        list when ``file_address`` does not end in ``.csv``.
    """
    # Guard clause: anything that is not a ``.csv`` path yields no documents.
    if not file_address.endswith(".csv"):
        return []

    csv_loader = CSVLoader(file_path=file_address, encoding='utf-8')
    return csv_loader.load()
|
|
|
def from_pdf(file_address):
    """Read a PDF file into documents.

    Args:
        file_address: Path of the PDF file, handed to ``PyPDFLoader``.

    Returns:
        The result of ``PyPDFLoader.load()`` for that file.
    """
    return PyPDFLoader(file_path=file_address).load()
|
|
|
def from_text_files(file_address):
    """Load plain-text documents from a single file or from a directory.

    Args:
        file_address: Either the path of one text file, or the path of a
            directory whose ``.txt`` files should all be loaded
            (sub-directories are not searched).

    Returns:
        A list with the documents produced by ``TextLoader`` for every file
        loaded. The list is empty when a directory contains no ``.txt``
        files.

    Raises:
        OSError: If ``file_address`` is neither a file nor a listable
            directory.
    """
    # main() sends single ``.txt`` paths here as well as directories. Listing
    # a regular file raises NotADirectoryError, so load it directly instead.
    if os.path.isfile(file_address):
        return TextLoader(file_address).load()

    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".txt"):
            docs.extend(TextLoader(file_path).load())
    return docs
|
|
|
def retriever_from_docs(docs, persist_directory="../../chroma_db"):
    """Split documents into chunks, embed them and store them in Chroma.

    Each chunk is written to the persistent Chroma store exactly once, under
    a freshly generated UUID. The same call is used whether or not the store
    directory already exists, since ``Chroma.from_documents`` is given the
    ``persist_directory`` in both cases.

    Args:
        docs: Documents to index. ``None`` or an empty sequence is reported
            and skipped.
        persist_directory: Directory of the persistent Chroma store. Defaults
            to the path this script has always used.

    Returns:
        None. Progress is reported on standard output.
    """
    if not docs:
        print("No documents to process.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    print(f"Number of document chunks: {len(splits)}")

    if not splits:
        # Nothing survived splitting, so there is nothing to embed or insert.
        print("No documents to process.")
        return

    # One id per chunk, passed to the single insert below so the count
    # printed here matches what is actually stored.
    doc_ids = [str(uuid.uuid4()) for _ in splits]
    print(f"Number of IDs generated: {len(doc_ids)}")

    if not os.path.exists(persist_directory):
        print(f"{persist_directory} does not exist")

    # Insert once. The previous flow called from_documents() and then
    # add_texts() on the same chunks, storing every chunk twice whenever the
    # directory already existed. It also pre-computed embeddings that
    # add_texts() does not accept, costing an extra embedding API call.
    Chroma.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(),
        ids=doc_ids,
        persist_directory=persist_directory,
    )

    print("Embeddings are added to vector store.")
|
|
|
|
|
def main():
    """Index the source named by the first command-line argument.

    The argument is matched case-insensitively and dispatched as follows:

    * starts with ``http://`` or ``https://``: web page
    * contains ``.xls`` or ``excel``: Excel file or directory
    * contains ``.csv``: CSV file
    * contains ``.pdf``: PDF file
    * contains ``.txt`` or ``text``: text file or directory

    Markers are tried in the order ``.xls``, ``.csv``, ``.pdf``, ``.txt``,
    ``excel``, ``text``; the first match wins. Local paths must exist.
    Problems are reported on standard output and the function returns
    without raising.
    """
    print(sys.argv)
    load_dotenv()

    if len(sys.argv) < 2:
        # Previously this case failed with IndexError on sys.argv[1].
        print("No input given. Pass a URL, a file or a directory as the first argument.")
        return

    source = sys.argv[1]
    lowered = source.lower()

    # Only real URLs go to the web loader; a bare substring test would also
    # catch local paths that merely contain "http".
    if lowered.startswith(("http://", "https://")):
        retriever_from_docs(from_web(source))
        return

    # Validate the path that will actually be loaded, not an unrelated
    # hard-coded dataset location.
    if not os.path.exists(source):
        print("File address does not exist.")
        return

    # Order matters: extension markers take precedence over the looser
    # directory-name keywords.
    dispatch = (
        ('.xls', from_excel),
        ('.csv', from_csv),
        ('.pdf', from_pdf),
        ('.txt', from_text_files),
        ('excel', from_excel),
        ('text', from_text_files),
    )
    for marker, load in dispatch:
        if marker in lowered:
            retriever_from_docs(load(source))
            return

    print("Unsupported file format for file.")
|
|
|
# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()