initial
Browse files- .gitignore +2 -0
- app.py +63 -0
- embed_with_db.py +59 -0
- requirements.txt +0 -0
- vectorize.py +54 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
venv
|
app.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from embed_with_db import get_all_collections, VECTORDB_STORE, config
|
3 |
+
from vectorize import VectorDataBase
|
4 |
+
|
5 |
+
def respond(message, chat_history, collection_name):
    """Answer ``message`` with the retrieval chain of the chosen collection.

    Args:
        message: The user's prompt text.
        chat_history: List of ``(prompt, answer)`` pairs; it is extended in
            place with the new exchange.
        collection_name: Display name of the collection to retrieve from.

    Returns:
        A ``("", chat_history)`` tuple: an empty string followed by the
        updated history.
    """
    store = VECTORDB_STORE(collection_name)
    answer = store.chain().invoke(message)
    chat_history.append((message, answer))
    return "", chat_history
|
10 |
+
|
11 |
+
|
12 |
+
def embed_and_store(password, collection_name, file_type, file_fields, context):
    """Embed an uploaded file or a typed string and store it in the database.

    Args:
        password: Admin password typed by the user; must equal
            ``config['PASSWORD_DB']``.
        collection_name: Name of the collection that receives the chunks.
        file_type: Selected input kind (``'PDF'``, ``'TEXT'`` or ``'STRING'``).
        file_fields: Uploaded file value; used for the file-based kinds.
        context: Raw text; used instead of ``file_fields`` for ``'STRING'``.

    Returns:
        ``("", "")`` on success.

    Raises:
        Exception: If the password does not match.
    """
    # Guard clause: reject bad credentials before touching the database.
    if password != config['PASSWORD_DB']:
        raise Exception('Something went wrong')

    # The dropdown supplies upper-case labels ('STRING'), so the comparison
    # must be case-insensitive; the previous exact match against 'string'
    # never fired and the typed context was silently ignored.
    if str(file_type).lower() == 'string':
        file_fields = context

    vector_db = VectorDataBase(file_fields, collection_name, file_type)
    vector_db.embedding_with_loop()
    return "", ""
|
21 |
+
def update_interface(file_type):
    """Return the (textbox, file) component pair matching ``file_type``.

    For ``'PDF'`` and ``'TEXT'`` the textbox is hidden and the file picker is
    shown; for any other value the textbox is shown and the file picker is
    hidden.
    """
    if file_type not in ('PDF', 'TEXT'):
        context_box = gr.Textbox(visible=True, label='Enter the Context', interactive=True)
        file_picker = gr.File(visible=False)
        return context_box, file_picker

    context_box = gr.Textbox(visible=False)
    file_picker = gr.File(label='Select the file', interactive=True, visible=True)
    return context_box, file_picker
|
26 |
+
|
27 |
+
# Two-tab Gradio UI: a chat tab that queries a selected collection and an
# admin tab that embeds a file or a string into a collection.
with gr.Blocks() as demo:
    with gr.Tab('Personal Chat bot'):
        gr.Markdown("""
        <div align='center'>RAG Application with Open Source models</div>

        > You could ask anything about Me & Data Science. I hope it will find you well
        """)
        # Choices are the human-readable collection names; custom values are
        # allowed, so the selected name may not exist in the database.
        db_collection = gr.Dropdown(
            list(get_all_collections().values()), label="Select Collection for the retriever",
            value='Data scientist',
            allow_custom_value=True)
        chatbot = gr.Chatbot(height=480)  # Just to fit the notebook
        msg = gr.Textbox(label="Prompt", interactive=True)
        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")
        btn.click(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])
        msg.submit(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])  # Press enter to submit

    with gr.Tab('Data Base and Embedding Store'):
        gr.Markdown("""
        <div align='center'>Store the Document | String in Database</div>

        > Only admin user allowed
        """)
        with gr.Row():
            # Mask the admin password instead of echoing it in clear text.
            password = gr.Textbox(label='Enter the Password', type='password')
            collection_name = gr.Textbox(label='Collection Name')
            file_type = gr.Dropdown(['PDF', 'TEXT', 'STRING'], label='Select File Type',
                                    value='PDF')
        file_fields = gr.File(visible=True, interactive=True)
        context = gr.Textbox(label="Enter the Context", visible=False)
        # NOTE: rebinds ``btn``; the chat tab's click handler is already wired.
        btn = gr.Button("Submit")

        btn.click(embed_and_store, inputs=[password, collection_name, file_type, file_fields, context], outputs=[file_fields, context])
        # Swap between the file picker and the context textbox on selection.
        file_type.change(update_interface, inputs=[file_type], outputs=[context, file_fields])
gr.close_all()
demo.launch()
|
embed_with_db.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_mongodb import MongoDBAtlasVectorSearch
|
2 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
3 |
+
from pymongo import MongoClient
|
4 |
+
from langchain_core.runnables import RunnablePassthrough
|
5 |
+
from langchain_core.output_parsers import StrOutputParser
|
6 |
+
from langchain.prompts import ChatPromptTemplate
|
7 |
+
from langchain_community.llms import HuggingFaceEndpoint
|
8 |
+
from dotenv import dotenv_values
|
9 |
+
# Settings loaded via python-dotenv; the keys read in this module are
# MONGODB_CONN_STRING, HUGGINGFACEHUB_API_TOKEN, DB_NAME and
# VECTOR_SEARCH_INDEX.
config= dotenv_values()
# Shared MongoDB client, created once at import time.
client = MongoClient(config['MONGODB_CONN_STRING'])
# Embedding model used for the vector store.
embeddings = HuggingFaceEmbeddings(model_name= "intfloat/e5-large-v2")

# Hosted LLM endpoint that generates the final answer.
llm_model = HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.2',
                                huggingfacehub_api_token=config['HUGGINGFACEHUB_API_TOKEN'],
                                temperature=0.3)

# Prompt with two placeholders: {context} and {question}.
# NOTE(review): the template closes [/INST] twice, places </s> mid-prompt and
# contains the typo "Your are" — confirm this is the intended prompt format.
template = """
<s>[INST] Instruction:Your are a helpful chatbot who can answer all data science ,anime and manga questions.
You have to follow these rules strictly while answering the question based on context:
1. Do not use the word context or based on context which is provided in answers.
2. If there is no context you have to answer in 128 words not more than that.
3. context are in series format so make your own best pattern based on that give answer.
[/INST]
context:
{context}</s>
### QUESTION:
{question} [/INST]
"""
prompt = ChatPromptTemplate.from_template(template=template)
# Converts the model output to a plain string.
parser = StrOutputParser()
|
31 |
+
|
32 |
+
|
33 |
+
def get_all_collections():
    """Map each collection name in the configured database to a display name.

    The display name is the raw name capitalized, with every underscore
    replaced by a space.

    Returns:
        dict: ``{raw_collection_name: display_name}``.
    """
    db = client[config['DB_NAME']]
    return {
        raw: str(raw).capitalize().replace('_', ' ')
        for raw in db.list_collection_names()
    }
|
40 |
+
class VECTORDB_STORE:
    """Vector store bound to one MongoDB collection, plus its retrieval chain."""

    def __init__(self, coll_name):
        """Open the vector store for the collection shown as ``coll_name``.

        Args:
            coll_name: Display name of the collection, as produced by
                ``get_all_collections``.

        Raises:
            ValueError: If no collection has that display name.
        """
        collection_name = self.get_collection_name(coll_name)
        # Fail early with a clear message instead of indexing the database
        # with ``None`` and failing inside the driver.
        if collection_name is None:
            raise ValueError(f'Unknown collection: {coll_name!r}')
        collection = client[config['DB_NAME']][collection_name]
        self.vectordb_store = MongoDBAtlasVectorSearch(collection=collection,
                                                       embedding=embeddings,
                                                       index_name=config['VECTOR_SEARCH_INDEX'])

    @staticmethod
    def get_collection_name(coll_name):
        """Return the raw collection name for a display name, or ``None``."""
        for key, value in get_all_collections().items():
            if coll_name == value:
                return key
        return None

    def chain(self):
        """Build the retrieval chain: retriever -> prompt -> LLM -> string.

        The retriever is configured with ``k=5``; the incoming question is
        passed through unchanged as the ``question`` prompt variable.
        """
        retriever = self.vectordb_store.as_retriever(search_kwargs={"k": 5})
        chain = {'context': retriever, 'question': RunnablePassthrough()} | prompt | llm_model | parser
        return chain
|
59 |
+
|
requirements.txt
ADDED
Binary file (4.07 kB). View file
|
|
vectorize.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_mongodb import MongoDBAtlasVectorSearch
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
4 |
+
from embed_with_db import embeddings, config, VECTORDB_STORE, client
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
class VectorDataBase():
    """Split a document (or raw string) into chunks and store them in MongoDB.

    Args:
        file_path: Path handed to the PDF/text loader or, for any other
            ``file_type``, the raw text itself.
        db_collection: Name of the target collection inside
            ``config['DB_NAME']``.
        file_type: ``'pdf'`` or ``'text'`` (case-insensitive) selects a file
            loader; any other value treats ``file_path`` as the text.
    """

    def __init__(self, file_path, db_collection, file_type='pdf', ):
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]

    def load_docs_split(self):
        """Load the input and return it as a list of split documents."""
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            loader = None

        if loader:
            docs = loader.load()
            return self.text_splitter.split_documents(docs)
        # No loader: ``file_path`` holds the raw text to be chunked.
        return self.text_splitter.create_documents([self.file_path])

    def docs_embeddings(self):
        """Embed all chunks in one call via ``MongoDBAtlasVectorSearch``.

        Returns:
            The created vector store, or the string ``'Some issues'`` when
            there is nothing to embed.
        """
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'])
            print('done!')
            return docsearch
        else:
            print('documents is not embedded')
            return 'Some issues'

    def add_collection_database(self, doc):
        """Insert one chunk and its embedding into this instance's collection.

        This was a staticmethod that referenced an undefined global
        ``collection`` and therefore raised ``NameError`` on every call; it
        now writes to ``self.db_collection``.
        """
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0)
            }
        )

    def embedding_with_loop(self):
        """Embed and insert the chunks one at a time, showing a progress bar."""
        docs = self.load_docs_split()
        if docs:
            for doc in tqdm(docs):
                self.add_collection_database(doc)
|
54 |
+
|