from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
import os
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import TextLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Import CSV Files to the VectorDB
# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0

# df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "98_row_Mental_Health_FAQs")
# df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "Counsellor_Chats")
# df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "99_rows_Human_&_Therapist")

# Get the directory path of the current script
script_dir = os.path.dirname(os.path.abspath(__file__))

loader = PyMuPDFLoader(os.path.join(script_dir, 'Data','PDFs', 'DepressionGuide-web.pdf'))
documents  = loader.load()

# create the open-source embedding function
# Docs:- https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embedding_function)

# The storage layer for the parent documents
store = InMemoryStore()

def instantiate_rag():
    rag_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    rag_retriever.add_documents(documents)
    return rag_retriever