itachi-ai committed on
Commit
7d6888a
1 Parent(s): a5b7ad2
Files changed (5) hide show
  1. .gitignore +2 -0
  2. app.py +63 -0
  3. embed_with_db.py +59 -0
  4. requirements.txt +0 -0
  5. vectorize.py +54 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ venv
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from embed_with_db import get_all_collections, VECTORDB_STORE, config
3
+ from vectorize import VectorDataBase
4
+
5
def respond(message, chat_history, collection_name):
    """Answer ``message`` with the retrieval chain of ``collection_name``.

    A chain is built through ``VECTORDB_STORE(collection_name).chain()`` and
    invoked with the message. The ``(message, answer)`` pair is appended to
    ``chat_history`` in place.

    Returns:
        A tuple ``("", chat_history)``: an empty string followed by the
        updated history list.
    """
    answer = VECTORDB_STORE(collection_name).chain().invoke(message)
    chat_history.append((message, answer))
    return "", chat_history
10
+
11
+
12
def embed_and_store(password, collection_name, file_type, file_fields, context):
    """Embed an uploaded file or a pasted string and store it in a collection.

    Args:
        password: Value typed by the user; must equal ``config['PASSWORD_DB']``.
        collection_name: Name of the target collection.
        file_type: One of the dropdown values ``'PDF'``, ``'TEXT'`` or
            ``'STRING'`` (compared case-insensitively).
        file_fields: The uploaded file value; ignored for ``'STRING'``.
        context: Raw text to embed when ``file_type`` is ``'STRING'``.

    Returns:
        ``(None, "")`` -- reset values for the file and context outputs.

    Raises:
        Exception: If the password does not match.
    """
    if password != config['PASSWORD_DB']:
        raise Exception('Something went wrong')

    # The dropdown emits upper-case values ('STRING'); comparing against the
    # lower-case literal without normalising never matched, so the pasted
    # context was silently ignored.
    if str(file_type).lower() == 'string':
        file_fields = context

    vector_db = VectorDataBase(file_fields, collection_name, file_type)
    vector_db.embedding_with_loop()
    # None (not an empty string) is the "no file" value for a file component.
    return None, ""
21
def update_interface(file_type):
    """Choose which input widget is visible for the selected ``file_type``.

    Returns:
        A ``(textbox, file)`` pair of components. For ``'PDF'`` and ``'TEXT'``
        the textbox is hidden and the file picker is shown; for any other
        value the textbox is shown and the file picker is hidden.
    """
    needs_upload = file_type in ('PDF', 'TEXT')
    if not needs_upload:
        return gr.Textbox(visible=True, label='Enter the Context', interactive=True), gr.File(visible=False)
    return gr.Textbox(visible=False), gr.File(label='Select the file', interactive=True, visible=True)
26
+
27
# Two-tab Gradio UI: a chat tab backed by `respond` and an admin tab backed by
# `embed_and_store`.
# NOTE(review): the original indentation is not visible in this view; which
# components sit inside the gr.Row below is assumed -- confirm against the file.
with gr.Blocks() as demo:
    with gr.Tab('Personal Chat bot'):
        gr.Markdown("""
        <div align='center'>RAG Application with Open Source models</div>

        > You could ask anything about Me & Data Science. I hope it will find you well
        """)
        db_collection = gr.Dropdown(
            list(get_all_collections().values()), label="Select Collection for the retriever",
            value='Data scientist',
            allow_custom_value=True)
        chatbot = gr.Chatbot(height=480)  # Just to fit the notebook
        msg = gr.Textbox(label="Prompt", interactive=True)
        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")
        btn.click(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])
        msg.submit(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])  # Press enter to submit

    with gr.Tab('Data Base and Embedding Store'):
        gr.Markdown("""
        <div align='center'>Store the Document | String in Database</div>

        > Only admin user allowed
        """)
        with gr.Row():
            # Mask the admin password instead of echoing it in plain text.
            password = gr.Textbox(label='Enter the Password', type='password')
            collection_name = gr.Textbox(label='Collection Name')
            file_type = gr.Dropdown(['PDF', 'TEXT', 'STRING'], label='Select File Type',
                                    value='PDF')
        file_fields = gr.File(visible=True, interactive=True)
        context = gr.Textbox(label="Enter the Context", visible=False)
        btn = gr.Button("Submit")

        btn.click(embed_and_store, inputs=[password, collection_name, file_type, file_fields, context], outputs=[file_fields, context])
        # Swap between the file picker and the context box when the type changes.
        file_type.change(update_interface, inputs=[file_type], outputs=[context, file_fields])
gr.close_all()
demo.launch()
embed_with_db.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_mongodb import MongoDBAtlasVectorSearch
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from pymongo import MongoClient
4
+ from langchain_core.runnables import RunnablePassthrough
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain.prompts import ChatPromptTemplate
7
+ from langchain_community.llms import HuggingFaceEndpoint
8
+ from dotenv import dotenv_values
9
# Module-level resources shared by the app: settings, the MongoDB client, the
# embedding model, the hosted LLM endpoint, and the prompt/parser pair.
config = dotenv_values()
client = MongoClient(config['MONGODB_CONN_STRING'])
embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")

llm_model = HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.2',
                                huggingfacehub_api_token=config['HUGGINGFACEHUB_API_TOKEN'],
                                temperature=0.3)

# The whole request (rules, retrieved context and question) is kept inside a
# single [INST] ... [/INST] turn. The previous template closed the instruction
# before the context, emitted an end-of-sequence marker in the middle of the
# prompt, and ended with a second, unmatched [/INST].
template = """
<s>[INST] Instruction: You are a helpful chatbot who can answer all data science, anime and manga questions.
You have to follow these rules strictly while answering the question based on context:
1. Do not use the word context or based on context which is provided in answers.
2. If there is no context you have to answer in 128 words not more than that.
3. context are in series format so make your own best pattern based on that give answer.

context:
{context}

### QUESTION:
{question} [/INST]
"""
prompt = ChatPromptTemplate.from_template(template=template)
parser = StrOutputParser()
31
+
32
+
33
def get_all_collections():
    """Map every collection name in the configured database to a display label.

    The label is the name passed through ``str.capitalize`` with each
    underscore turned into a space.

    Returns:
        dict: ``{collection_name: label}``.
    """
    database = client[config['DB_NAME']]
    return {
        name: str(name).capitalize().replace('_', ' ')
        for name in database.list_collection_names()
    }
40
class VECTORDB_STORE:
    """Vector-search store bound to one MongoDB collection, plus its chain."""

    def __init__(self, coll_name):
        """Open the vector store for the collection whose label is ``coll_name``.

        Args:
            coll_name: Display label as produced by ``get_all_collections``.

        Raises:
            ValueError: If no collection has that label. Previously the
                resulting ``None`` was passed on as the collection name.
        """
        collection_name = self.get_collection_name(coll_name)
        if collection_name is None:
            raise ValueError(f'Unknown collection: {coll_name!r}')
        collection = client[config['DB_NAME']][collection_name]
        self.vectordb_store = MongoDBAtlasVectorSearch(collection=collection,
                                                       embedding=embeddings,
                                                       index_name=config['VECTOR_SEARCH_INDEX'])

    @staticmethod
    def get_collection_name(coll_name):
        """Return the collection name whose label equals ``coll_name``, else ``None``."""
        for key, value in get_all_collections().items():
            if coll_name == value:
                return key
        return None

    def chain(self):
        """Build the retrieval chain: retriever + question -> prompt -> LLM -> parser.

        The retriever is created with ``search_kwargs={"k": 5}``.
        """
        retriever = self.vectordb_store.as_retriever(search_kwargs={"k": 5})
        chain = {'context': retriever, 'question': RunnablePassthrough()} | prompt | llm_model | parser
        return chain
59
+
requirements.txt ADDED
Binary file (4.07 kB). View file
 
vectorize.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_mongodb import MongoDBAtlasVectorSearch
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
4
+ from embed_with_db import embeddings, config, VECTORDB_STORE, client
5
+ from tqdm import tqdm
6
+
7
class VectorDataBase():
    """Load a document (or raw string), split it, embed it and store it in MongoDB."""

    def __init__(self, file_path, db_collection, file_type='pdf', ):
        """Remember the input and open the target collection.

        Args:
            file_path: Value handed to the loader for ``'pdf'``/``'text'``;
                for any other ``file_type`` it is treated as the text itself.
            db_collection: Name of the collection inside ``config['DB_NAME']``.
            file_type: ``'pdf'``, ``'text'`` or anything else (case-insensitive).
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]

    def load_docs_split(self):
        """Return the input split into documents by ``self.text_splitter``."""
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            loader = None
        if loader:
            docs = loader.load()
            return self.text_splitter.split_documents(docs)
        # No loader: ``file_path`` holds the raw text itself.
        return self.text_splitter.create_documents([self.file_path])

    def docs_embeddings(self):
        """Embed all chunks in one call via ``MongoDBAtlasVectorSearch.from_documents``.

        Returns:
            The created vector store, or the string ``'Some issues'`` when
            there is nothing to embed.
        """
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'])
            print('done!')
            return docsearch
        else:
            print('documents is not embedded')
            return 'Some issues'

    def add_collection_database(self, doc):
        """Embed one document and insert it into ``self.db_collection``.

        This used to be a static method that referenced an undefined name
        ``collection`` and therefore raised ``NameError`` on every call; it
        now writes to the collection opened in ``__init__``.
        """
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0)
            }
        )

    def embedding_with_loop(self):
        """Split the input and store the chunks one at a time with a progress bar."""
        docs = self.load_docs_split()
        if docs:
            for doc in tqdm(docs):
                self.add_collection_database(doc)
54
+