Spaces:
Build error
kishoregajjala committed on
Commit • db7706f • 1 Parent(s): 30dbaa8
Upload 8 files
Browse files
- README.md +2 -13
- app.py +39 -0
- llm_generator.py +161 -0
- nlp_models.py +39 -0
- rag_pipeline.py +55 -0
- rag_pipeline_vectordb.py +92 -0
- requirements.txt +17 -0
- test_vectordb.ipynb +218 -0
README.md
CHANGED
@@ -1,13 +1,2 @@
-
-
-emoji: 🐠
-colorFrom: purple
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.32.2
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Mental_Health_Chatbot_Integrated
+Mental_Health_Chatbot_Integrated
app.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+import llm_generator
+from llm_generator import llm_generation
+
+import time
+
+# ST : https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
+
+st.title('Mental Health Therapist')
+
+def response_generator(response):
+    '''
+    Streams the response text with a typewriter effect.
+    '''
+    response_buffer = response.strip()
+    for word in response_buffer.split():
+        yield word + " "
+        time.sleep(0.1)
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Replay the conversation so far on each rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Accept user input
+if user_prompt := st.chat_input("Hello, how are you doing today?"):
+    st.session_state.messages.append({"role": "user", "content": user_prompt})
+    with st.chat_message("user"):
+        st.markdown(user_prompt)
+
+    with st.chat_message("assistant"):
+        response = llm_generation(user_prompt)
+        time.sleep(1)
+        st.write_stream(response_generator(response))
+
+    st.session_state.messages.append({"role": "assistant", "content": response})
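For reference, the typewriter effect above can be exercised outside Streamlit; a minimal sketch (the sample sentence is made up):

# Standalone check of app.py's typewriter-effect generator; sample text is hypothetical.
import time

def response_generator(response):
    response_buffer = response.strip()
    for word in response_buffer.split():
        yield word + " "
        time.sleep(0.1)

for chunk in response_generator("Hello there, thanks for reaching out today."):
    print(chunk, end="", flush=True)
print()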
llm_generator.py
ADDED
@@ -0,0 +1,161 @@
+from langchain.prompts import PromptTemplate
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain.vectorstores import Chroma
+from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+import os
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)  # Docs:- https://python.langchain.com/docs/modules/model_io/prompts/message_prompts
+
+#import chromadb
+
+# LLM Generator
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ChatMessageHistory, ConversationSummaryBufferMemory, ConversationBufferMemory
+
+from langchain_experimental.chat_models import Llama2Chat
+# Docs:- https://python.langchain.com/docs/integrations/chat/llama2_chat
+
+# Read the token from the environment (e.g. a Spaces secret) rather than a bare,
+# undefined name, which raises NameError at import time
+HUGGINGFACEHUB_API_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
+#os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
+
+# TODO: implement another function to pass an array of PDFs / CSVs / Excels
+from rag_pipeline import instantiate_rag
+retriever = instantiate_rag()
+
+#persist_directory="Data/chroma"
+#chroma_client = chromadb.PersistentClient(persist_directory=persist_directory)
+#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+#vectors = Chroma(persist_directory = persist_directory, embedding_function = embedding_function)
+#retriever = vectors.as_retriever() #(k=6)
+
+# Set the url to your Inference Endpoint below
+#your_endpoint_url = "https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud"
+
+# How to access the HuggingFaceEndpoint integration of the free Serverless Endpoints API
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+llm = HuggingFaceEndpoint(
+    #endpoint_url=f"{your_endpoint_url}",
+    repo_id=repo_id,
+    #max_length=128,
+    max_new_tokens=512,
+    token=HUGGINGFACEHUB_API_TOKEN,
+    temperature=0.1,
+    repetition_penalty=1.1,
+    #context_length: 4096, # Set to max for chat summary; Llama-2 has a max context length of 4096
+    stream=True,
+    callbacks=[StreamingStdOutCallbackHandler()],
+    #top_k=10,
+    #top_p=0.95,
+)
+
+model = Llama2Chat(llm=llm)
+memory = ConversationBufferMemory(
+    llm=llm,
+    memory_key="chat_history",
+    return_messages=True,
+    output_key='answer',
+    input_key='question')
+
+# Prompt context reference: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF , https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ/discussions/5#64b81e9b15ebeb44419a2b9e
+# Reference:- https://github.com/langchain-ai/langchain/issues/5462
+
+system_message_template = """You're a Mental Health Specialist. Support those with Depressive Disorder.
+Listen compassionately, respond helpfully. For casual talk, be friendly. For facts, use context.
+If unsure, say, 'Out of my knowledge.' Always stay direct.
+If you cannot find the answer from the pieces of context, just say that you don't know; don't try to make up an answer.
+----------------
+{context}"""
+
+messages = [
+    SystemMessagePromptTemplate.from_template(system_message_template),
+    HumanMessagePromptTemplate.from_template("{question}")
+]
+qa_prompt = ChatPromptTemplate.from_messages(messages)
+qa_prompt.pretty_print()
+
+condense_question = """Given the following conversation and a follow-up message,
+rephrase the follow-up message to a stand-alone question or instruction that
+represents the user's intent precisely. Add context if necessary to produce a complete and
+unambiguous question, based only on the follow-up question and the chat history; don't make up messages.
+Maintain the same intent as the follow-up input message.\n
+Chat History:
+{chat_history}\n
+Follow Up Input: {question}
+Standalone question:"""
+
+condense_question_prompt = PromptTemplate.from_template(condense_question)
+condense_question_prompt.pretty_print()
+
+retrieval_chain = ConversationalRetrievalChain.from_llm(
+    llm=llm,
+    retriever=retriever,
+    memory=memory,
+    return_source_documents=False,
+    verbose=True,
+    #condense_question_prompt=condense_question_prompt,
+    #chain_type="stuff",
+    combine_docs_chain_kwargs={'prompt': qa_prompt},  # https://github.com/langchain-ai/langchain/issues/6879
+)
+
+human_inputs = ['Nothing logged yet']
+ai_responses = ['Nothing logged yet']
+
+history = ChatMessageHistory()
+
+def llm_generation(question: str):
+    # 'answer' is the dict key holding the latest response from the AI
+    llm_answer = retrieval_chain.invoke({'question': question, 'chat_history': history.messages})['answer']
+    history.add_user_message(question)
+    history.add_ai_message(llm_answer)
+    return llm_answer
+
+# Decide whether to place this in streamlit.py
+# or make a new post_process.py and import that into streamlit
+def extract_dialogues(text):
+    '''
+    Returns two lists, one of human and one of AI dialogues.
+    '''
+    human_dialogues = []
+    ai_dialogues = []
+    lines = text.split('\n')
+
+    # Iterate through each line
+    for line in lines:
+        # Remove leading and trailing whitespace
+        line = line.strip()
+
+        # Check if the line starts with 'Human:' or 'AI:'
+        if line.startswith('Human:'):
+            # Extract the text after 'Human:'
+            human_dialogues.append(line[len('Human:'):].strip())
+        elif line.startswith('AI:'):
+            # Extract the text after 'AI:'
+            ai_dialogues.append(line[len('AI:'):].strip())
+    return human_dialogues, ai_dialogues
+
+def update_list():
+    global human_inputs, ai_responses
+    # fix: assign to human_inputs (the original assigned human_responses, leaving the global stale)
+    human_inputs, ai_responses = extract_dialogues(memory.buffer_as_str)
+    return 'responses updated'
+
+def is_depressed():
+    '''
+    Returns whether, according to the human inputs, the person is depressed or not.
+    '''
+    # Implement classification
+    all_user_inputs = ' '.join(human_inputs)
+    from nlp_models import sentiment_class, pattern_classification, corelation_analysis
+    is_depressed = sentiment_class(all_user_inputs)
+    return 'Not so depressed' if is_depressed[0][1] > 0.5 else 'is_depressed'
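The original file assigned `HUGGINGFACEHUB_API_TOKEN = HF_ACCESS_TOKEN`, an undefined name and a likely cause of the Space's build error; the version above reads the token from the environment instead. A minimal sketch of running the module locally under that assumption (the token value is a placeholder; on Hugging Face Spaces it would be configured as a repository secret named HF_ACCESS_TOKEN):

# Sketch: provide the token before importing llm_generator; assumes the
# HF_ACCESS_TOKEN environment variable name used above.
import os
os.environ.setdefault("HF_ACCESS_TOKEN", "hf_...")  # placeholder, not a real token

from llm_generator import llm_generation
print(llm_generation("Hi, I've been feeling low lately."))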
nlp_models.py
ADDED
@@ -0,0 +1,39 @@
+import torch
+from transformers import DistilBertForSequenceClassification
+import os
+# # Get the directory path of the current script
+# script_dir = os.path.dirname(os.path.abspath(__file__))
+# model = DistilBertForSequenceClassification.from_pretrained("model.safetensors")
+
+# Load model directly
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tokenizer = AutoTokenizer.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+model = AutoModelForSequenceClassification.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+
+# from transformers import DistilBertTokenizerFast
+# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+# Move the model to the GPU if available
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+
+def sentiment_class(summarized_text):
+    '''
+    # 1 = non-depressed
+    # 0 = depressed
+    Returns, for example: array([[0.00493283, 0.9950671]], dtype=float32)
+    '''
+    # Use the same device as the model so this also works on CPU-only machines
+    # (the original hard-coded .to('cuda'))
+    inputs = tokenizer(summarized_text, padding=True, truncation=True, return_tensors='pt').to(device)
+    outputs = model(**inputs)
+
+    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    predictions = predictions.cpu().detach().numpy()
+    return predictions
+
+def pattern_classification():
+    raise NotImplementedError  # placeholder: the original returned an undefined `result`
+
+def corelation_analysis():
+    raise NotImplementedError  # placeholder: the original returned an undefined `result`
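A quick smoke test of `sentiment_class` (the input sentence is made up; per the docstring, index 1 of the output row is the non-depressed probability):

# Hypothetical smoke test for nlp_models.sentiment_class.
from nlp_models import sentiment_class

probs = sentiment_class("I have been feeling hopeless and tired for weeks.")
print(probs.shape)  # (1, 2) -> [depressed, non-depressed]
print("non-depressed" if probs[0][1] > 0.5 else "depressed")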
rag_pipeline.py
ADDED
@@ -0,0 +1,55 @@
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import TextLoader
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+import os
+from langchain.storage import InMemoryStore
+from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.retrievers import ParentDocumentRetriever
+from langchain_community.vectorstores import Chroma
+
+# Import CSV files to the VectorDB
+# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+# df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "98_row_Mental_Health_FAQs")
+# df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "Counsellor_Chats")
+# df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name= "99_rows_Human_&_Therapist")
+
+# Get the directory path of the current script
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
+loader = PyMuPDFLoader(os.path.join(script_dir, 'Data', 'pdf', 'Depression Help Guide.pdf'))
+documents = loader.load()
+
+# Create the open-source embedding function
+# Docs:- https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+
+parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+# This text splitter is used to create the child documents
+# It should create documents smaller than the parent
+child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+# The vectorstore to use to index the child chunks
+vectorstore = Chroma(
+    collection_name="split_parents", embedding_function=embedding_function)
+
+# The storage layer for the parent documents
+store = InMemoryStore()
+
+def instantiate_rag():
+    rag_retriever = ParentDocumentRetriever(
+        vectorstore=vectorstore,
+        docstore=store,
+        child_splitter=child_splitter,
+        parent_splitter=parent_splitter,
+    )
+    rag_retriever.add_documents(documents)
+    return rag_retriever
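A minimal usage sketch of the ParentDocumentRetriever this module builds (the query string is hypothetical):

# Hypothetical query against the retriever from rag_pipeline.py.
from rag_pipeline import instantiate_rag

retriever = instantiate_rag()
docs = retriever.get_relevant_documents("How can I support someone with depression?")
for doc in docs:
    print(doc.page_content[:100])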
rag_pipeline_vectordb.py
ADDED
@@ -0,0 +1,92 @@
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import TextLoader
+from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.storage import InMemoryStore
+from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.retrievers import ParentDocumentRetriever
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders.csv_loader import CSVLoader
+import chromadb
+from chromadb.utils import embedding_functions
+import os
+
+# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+persist_directory = "Data/chroma"
+chroma_client = chromadb.PersistentClient(path=persist_directory)
+
+# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+# This text splitter is used to create the child documents
+# It should create documents smaller than the parent
+child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+def get_file_paths_recursively(folder_path):
+    file_paths = []
+    for root, directories, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_paths.append(file_path)
+    return file_paths
+
+def vdb_csv_loader(file_paths):
+    for i in range(len(file_paths)):
+        loader = CSVLoader(file_path=file_paths[i], encoding="latin-1")
+        db = Chroma.from_documents(documents=loader.load(), embedding=embedding_function, collection_name="mental_health_csv_collection", persist_directory=persist_directory)  # params to include: (docs, embedding_function, collection_name, persist_directory)
+
+###
+def generate_csv_vector_db() -> None:
+    # Get the directory path of the current script
+    #script_dir = os.path.dirname(os.path.abspath(__file__))
+    #folder_path = os.path.join(script_dir, 'Data/csv')
+    folder_path = "Data/csv"
+    file_paths = get_file_paths_recursively(folder_path)
+
+    # Load all the files
+    vdb_csv_loader(file_paths)
+
+###
+pdf_collection = Chroma(collection_name="mental_health_pdf_collection", embedding_function=embedding_function, persist_directory=persist_directory)
+def vdb_pdf_loader(file_paths):
+    for i in range(len(file_paths)):
+        loader = PyMuPDFLoader(file_path=file_paths[i])
+        documents = loader.load()
+
+        store = InMemoryStore()
+        rag_retriever = ParentDocumentRetriever(
+            vectorstore=pdf_collection,
+            docstore=store,
+            child_splitter=child_splitter,
+            parent_splitter=parent_splitter,
+        )
+        rag_retriever.add_documents(documents)
+
+def generate_pdf_vector_db() -> None:
+    # Get the directory path of the current script
+    #script_dir = os.path.dirname(os.path.abspath(__file__))
+    #folder_path = os.path.join(script_dir, '/Data/pdf')
+    folder_path = "Data/pdf"
+    file_paths = get_file_paths_recursively(folder_path)
+    vdb_pdf_loader(file_paths)
+
+def vectordb_load():
+    # call csv loader
+    generate_csv_vector_db()
+
+    # call PDF loader
+    generate_pdf_vector_db()
+
+# call vector db load
+vectordb_load()
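Once `vectordb_load()` has run, the persisted collections can be inspected directly with the Chroma client; a minimal sketch, assuming the Data/chroma path and collection names used above:

# Sketch: list the persisted collections and their document counts.
import chromadb

client = chromadb.PersistentClient(path="Data/chroma")
for collection in client.list_collections():
    print(collection.name, collection.count())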
requirements.txt
ADDED
@@ -0,0 +1,17 @@
+#create new env
+#conda create --name LLM_chatbot
+#activate the env
+#conda activate LLM_chatbot
+#pip install -r requirements.txt
+#if streamlit is still unrecognized, run "conda install -c conda-forge streamlit"
+#to run the app use: streamlit run app.py
+langchain==0.1.11
+torch==2.0.1
+transformers==4.36.2
+langchain-community==0.0.27
+streamlit==1.32.2
+ctransformers==0.2.27
+pymupdf==1.23.26
+sentence-transformers==2.5.1
+chromadb==0.4.24
+langchain_experimental
test_vectordb.ipynb
ADDED
@@ -0,0 +1,218 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import PyMuPDFLoader\n",
+    "from langchain_community.document_loaders import TextLoader\n",
+    "from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
+    "from langchain.storage import InMemoryStore\n",
+    "from langchain_community.document_loaders import TextLoader\n",
+    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+    "from langchain.retrievers import ParentDocumentRetriever\n",
+    "from langchain_community.vectorstores import Chroma\n",
+    "from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
+    "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+    "import chromadb\n",
+    "from chromadb.utils import embedding_functions\n",
+    "import os\n",
+    "\n",
+    "# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kishoregajjala/anaconda3/envs/mhc_1/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "# create the open-source embedding function\n",
+    "huggingface_ef = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "persist_directory=\"Data/chroma\"\n",
+    "chroma_client = chromadb.PersistentClient(path=persist_directory)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever\n",
+    "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n",
+    "\n",
+    "# This text splitter is used to create the child documents\n",
+    "# It should create documents smaller than the parent\n",
+    "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def get_file_paths_recursively(folder_path):\n",
+    "    file_paths = []\n",
+    "    for root, directories, files in os.walk(folder_path):\n",
+    "        for file in files:\n",
+    "            file_path = os.path.join(root, file)\n",
+    "            file_paths.append(file_path)\n",
+    "    return file_paths\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def vdb_csv_loader(file_paths):\n",
+    "    for i in range(len(file_paths)):\n",
+    "        loader = CSVLoader(file_path=file_paths[i], encoding=\"latin-1\")\n",
+    "        db = Chroma.from_documents(documents=loader.load(), embedding=huggingface_ef, collection_name= \"mental_health_csv_collection\", persist_directory=persist_directory) # params to include (docs, emb_fun, col_name, direct_path)\n",
+    "\n",
+    "###\n",
+    "def generate_csv_vector_db() -> None:\n",
+    "    \n",
+    "    # Get the directory path of the current script\n",
+    "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+    "    folder_path = \"Data/csv\"\n",
+    "    file_paths = get_file_paths_recursively(folder_path)\n",
+    "\n",
+    "    #load all the files\n",
+    "    vdb_csv_loader(file_paths)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "pdf_collection = Chroma(collection_name=\"mental_health_pdf_collection\", embedding_function=huggingface_ef, persist_directory=persist_directory) \n",
+    "def vdb_pdf_loader(file_paths):\n",
+    "    for i in range(len(file_paths)):\n",
+    "        loader = PyMuPDFLoader(file_path=file_paths[i])\n",
+    "        documents = loader.load()\n",
+    "        \n",
+    "        store = InMemoryStore()\n",
+    "        rag_retriever = ParentDocumentRetriever(\n",
+    "            vectorstore=pdf_collection,\n",
+    "            docstore=store,\n",
+    "            child_splitter=child_splitter,\n",
+    "            parent_splitter=parent_splitter,\n",
+    "        )\n",
+    "        rag_retriever.add_documents(documents)\n",
+    "\n",
+    "\n",
+    "def generate_pdf_vector_db() -> None:\n",
+    "    \n",
+    "    # Get the directory path of the current script\n",
+    "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+    "    folder_path = \"Data/pdf\" #os.path.join(script_dir, '/Data/pdf') \n",
+    "    file_paths = get_file_paths_recursively(folder_path)\n",
+    "    vdb_pdf_loader(file_paths)\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    " # call PDF loader\n",
+    "generate_pdf_vector_db()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# call csv loader\n",
+    "generate_csv_vector_db()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def vectordb_load(): \n",
+    "    # call csv loader\n",
+    "    generate_csv_vector_db()\n",
+    "\n",
+    "    # call PDF loader\n",
+    "    generate_pdf_vector_db()\n",
+    "\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# call vector db load\n",
+    "vectordb_load()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mhc_1",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}