Spaces:

itachi-ai
/

Chat-Bot

Sleeping

App Files Files Community

itachi-ai committed on Mar 25

Commit

7d6888a

•

1 Parent(s): a5b7ad2

initial

Browse files

Files changed (5) hide show

.gitignore +2 -0
app.py +63 -0
embed_with_db.py +59 -0
requirements.txt +0 -0
vectorize.py +54 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .env
2	+ venv

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import gradio as gr
+from embed_with_db import get_all_collections, VECTORDB_STORE, config
+from vectorize import VectorDataBase
def respond(message, chat_history, collection_name):
    """Answer one chat message using the selected collection as retriever.

    Args:
        message: Prompt text typed by the user.
        chat_history: List of (prompt, answer) pairs shown in the chatbot;
            the new pair is appended to it in place.
        collection_name: Display label of the collection to retrieve from.

    Returns:
        A tuple of an empty string (wired to the prompt box) and the
        updated chat history.
    """
    store = VECTORDB_STORE(collection_name)
    answer = store.chain().invoke(message)
    chat_history.append((message, answer))
    return "", chat_history
def embed_and_store(password, collection_name, file_type, file_fields, context):
    """Embed an uploaded file or pasted text into a collection (admin only).

    Args:
        password: Text typed in the password box; it must equal
            ``config['PASSWORD_DB']``.
        collection_name: Name of the collection that receives the chunks.
        file_type: Dropdown selection ('PDF', 'TEXT' or 'STRING').
        file_fields: Value of the file component, used for 'PDF' and 'TEXT'.
        context: Raw text from the context box, used for 'STRING'.

    Returns:
        A pair of empty strings for the two output components.

    Raises:
        Exception: If the password does not match.
    """
    if password != config['PASSWORD_DB']:
        raise Exception('Something went wrong')

    # The dropdown emits upper-case labels ('STRING'), so the comparison must
    # ignore case; otherwise the typed context would never replace the
    # (empty) file value.
    if str(file_type).lower() == 'string':
        file_fields = context

    vector_db = VectorDataBase(file_fields, collection_name, file_type)
    vector_db.embedding_with_loop()
    return "", ""
def update_interface(file_type):
    """Toggle between the file picker and the free-text context box.

    Args:
        file_type: Current value of the file-type dropdown.

    Returns:
        A (textbox, file) pair: for 'PDF' or 'TEXT' the file picker is
        visible and the textbox hidden; for anything else the reverse.
    """
    needs_upload = file_type in ('PDF', 'TEXT')
    if not needs_upload:
        context_box = gr.Textbox(visible=True, label='Enter the Context', interactive=True)
        file_picker = gr.File(visible=False)
        return context_box, file_picker

    context_box = gr.Textbox(visible=False)
    file_picker = gr.File(label='Select the file', interactive=True, visible=True)
    return context_box, file_picker
# Two-tab UI: a chat tab backed by `respond`, and an admin tab that stores
# documents through `embed_and_store`. Components are created in display order.
with gr.Blocks() as demo:
    with gr.Tab('Personal Chat bot'):
        gr.Markdown("""
    <div align='center'>RAG Application with Open Source models</div>
    > You could ask anything about Me & Data Science. I hope it will find you well
            """)
        # Choices are the display labels of the collections in the database.
        db_collection = gr.Dropdown(
            list(get_all_collections().values()),
            label="Select Collection for the retriever",
            value='Data scientist',
            allow_custom_value=True)
        chatbot = gr.Chatbot(height=480)  # Just to fit the notebook
        msg = gr.Textbox(label="Prompt", interactive=True)
        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")
        # The button and the Enter key trigger the same handler.
        btn.click(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])
        msg.submit(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])  # Press enter to submit
    with gr.Tab('Data Base and Embedding Store'):
        gr.Markdown("""
    <div align='center'>Store the Document | String in Database</div>
    > Only admin user allowed
            """)
        with gr.Row():
            # Mask the admin password instead of echoing it in clear text.
            password = gr.Textbox(label='Enter the Password', type='password')
            collection_name = gr.Textbox(label='Collection Name')
        file_type = gr.Dropdown(['PDF', 'TEXT', 'STRING'], label='Select File Type',
                                value='PDF')
        file_fields = gr.File(visible=True, interactive=True)
        context = gr.Textbox(label="Enter the Context", visible=False)
        btn = gr.Button("Submit")
        btn.click(embed_and_store,
                  inputs=[password, collection_name, file_type, file_fields, context],
                  outputs=[file_fields, context])
        # Swap the file picker and the context box when the file type changes.
        file_type.change(update_interface, inputs=[file_type], outputs=[context, file_fields])
gr.close_all()
demo.launch()

embed_with_db.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from langchain_mongodb import MongoDBAtlasVectorSearch
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from pymongo import MongoClient
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain.prompts import ChatPromptTemplate
+from langchain_community.llms import HuggingFaceEndpoint
+from dotenv import dotenv_values
import os

# Settings come from the local .env file, with the process environment as a
# fallback. dotenv_values() only parses the file, so a deployment that
# supplies its secrets as environment variables (the .env file is
# git-ignored) would otherwise fail on the first config[...] lookup.
# Values present in .env still take precedence, as before.
config = {**os.environ, **dotenv_values()}

# Shared MongoDB client, embedding model and hosted LLM used by every chain.
client = MongoClient(config['MONGODB_CONN_STRING'])
embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
llm_model = HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.2',
                                huggingfacehub_api_token=config['HUGGINGFACEHUB_API_TOKEN'],
                                temperature=0.3)

# Prompt with two placeholders: {context} (retrieved documents) and
# {question} (the user's message).
# NOTE(review): the template contains two closing [/INST] tags for a single
# opening [INST] and a </s> before the question -- confirm this matches the
# instruction format the model expects.
template = """
        <s>[INST] Instruction:Your are a helpful chatbot who can answer all data science ,anime and manga questions.
        You have to follow these rules strictly while answering the question based on context:
        1. Do not use the word context or based on context which is provided in answers.
        2. If there is no context you have to answer in 128 words not more than that.
        3. context are in series format so make your own best pattern based on that give answer.
        [/INST]
        context:
        {context}</s>
        ### QUESTION:
        {question} [/INST]
         """
prompt = ChatPromptTemplate.from_template(template=template)
parser = StrOutputParser()
def get_all_collections():
    """Map every collection name in the configured database to a display label.

    The label is the collection name with its first character upper-cased,
    the rest lower-cased, and underscores turned into spaces.

    Returns:
        dict: ``{collection_name: display_label}``.
    """
    db = client[config['DB_NAME']]
    return {
        raw_name: ' '.join(str(raw_name).capitalize().split('_'))
        for raw_name in db.list_collection_names()
    }
class VECTORDB_STORE:
    """Vector-search store over one collection, plus the RAG chain built on it."""

    def __init__(self, coll_name):
        """Open the vector store for the collection identified by *coll_name*.

        Args:
            coll_name: Display label (as produced by ``get_all_collections``)
                or raw name of an existing collection.

        Raises:
            ValueError: If *coll_name* matches no collection. Failing here
                with a clear message avoids indexing the database with
                ``None``.
        """
        collection_name = self.get_collection_name(coll_name)
        if collection_name is None:
            raise ValueError(f"Unknown collection: {coll_name!r}")
        collection = client[config['DB_NAME']][collection_name]
        self.vectordb_store = MongoDBAtlasVectorSearch(
            collection=collection,
            embedding=embeddings,
            index_name=config['VECTOR_SEARCH_INDEX'])

    @staticmethod
    def get_collection_name(coll_name):
        """Resolve a display label (or raw name) to the real collection name.

        Args:
            coll_name: Display label or raw collection name.

        Returns:
            The collection name, or ``None`` when nothing matches.
        """
        collections = get_all_collections()
        for key, value in collections.items():
            if coll_name == value:
                return key
        # The UI allows free-typed values, so also accept an exact raw name.
        if isinstance(coll_name, str) and coll_name in collections:
            return coll_name
        return None

    def chain(self):
        """Build the retrieval chain: retriever + question -> prompt -> LLM -> text.

        Returns:
            A runnable that takes the question and yields the parsed answer.
        """
        # Retrieve the five nearest documents as context for each question.
        retriever = self.vectordb_store.as_retriever(search_kwargs={"k": 5})
        chain = {'context': retriever, 'question': RunnablePassthrough()} | prompt | llm_model | parser
        return chain

requirements.txt ADDED Viewed

Binary file (4.07 kB). View file

vectorize.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from langchain_mongodb import MongoDBAtlasVectorSearch
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
+from embed_with_db import embeddings, config, VECTORDB_STORE, client
+from tqdm import tqdm
class VectorDataBase():
    """Split a PDF, a text file or a raw string into chunks and store their embeddings."""

    def __init__(self, file_path, db_collection, file_type='pdf'):
        """Remember the input and open the target collection.

        Args:
            file_path: Path of the file to load, or the raw text itself when
                *file_type* is neither 'pdf' nor 'text'.
            db_collection: Name of the collection that receives the chunks.
            file_type: 'pdf', 'text' (case-insensitive) or anything else for
                raw text.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]

    def load_docs_split(self):
        """Load the input and split it into chunks.

        Returns:
            The chunks produced by the text splitter, or an empty list when
            no file or text was supplied.
        """
        # Nothing selected / nothing typed: report "no chunks" instead of
        # failing inside a loader or the splitter.
        if not self.file_path:
            return []

        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            # Any other type: file_path holds the raw text itself.
            return self.text_splitter.create_documents([self.file_path])
        return self.text_splitter.split_documents(loader.load())

    def docs_embeddings(self):
        """Embed all chunks in one call and store them in the collection.

        Returns:
            The created vector store, or the string 'Some issues' when there
            were no chunks to embed.
        """
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'])
            print('done!')
            return docsearch
        else:
            print('documents is not embedded')
            return 'Some issues'

    def add_collection_database(self, doc):
        """Embed one chunk and insert it into this instance's collection.

        This used to be a static method that referenced an undefined name
        ``collection``; it now writes to ``self.db_collection``.

        Args:
            doc: Chunk exposing ``page_content`` and ``metadata``.
        """
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0)
            }
        )

    def embedding_with_loop(self):
        """Embed and insert the chunks one by one, showing a progress bar."""
        docs = self.load_docs_split()
        if docs:
            for doc in tqdm(docs):
                self.add_collection_database(doc)