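"""Streamlit RAG chat app: upload PDFs, index them in Pinecone, and chat
over their contents with a Groq-hosted Llama 3 model.

Pipeline: PyPDF2 text extraction -> RecursiveCharacterTextSplitter chunking
-> SentenceTransformer embeddings -> Pinecone upsert (one namespace per
session) -> top-k retrieval feeding a LangChain LLMChain with windowed
conversation memory.
"""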
import random
import string

import streamlit as st
from langchain.chains import LLMChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_groq import ChatGroq
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader

# Pinecone client, index handle, and the embedding model (768-dim vectors).
pc = Pinecone(api_key=st.secrets["PINE_CONE_KEY"])
index = pc.Index('example-index')
model = SentenceTransformer('all-mpnet-base-v2')



# Chat transcript shown in the main pane: one dict per question/answer pair.
if 'body' not in st.session_state:
    st.session_state.body = []

def randomIdGenerate():
    """Return a short random ID, used for vector IDs and the session namespace."""
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
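# Note: 5-character random IDs can collide once many chunks are indexed.
# A collision-free alternative (an assumption, not what this app ships with)
# would be uuid-based IDs:
#
#     import uuid
#     def randomIdGenerate():
#         return uuid.uuid4().hex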



def readFiles(files):
    """Extract text from the uploaded PDFs, chunk it, embed it, and upsert to Pinecone."""
    st.session_state.processing = "Reading files..."
    text = ""
    for pdf in files:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages.
            text += page.extract_text() or ""
    splits = get_text_chunks(text)
    emb = embedThetext(splits)
    saveInPinecone(emb)
    return splits
    
def get_text_chunks(text):
    """Split raw text into overlapping chunks so context survives chunk boundaries."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
    return text_splitter.split_text(text)
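# Example: with chunk_size=1000 and chunk_overlap=500, a 2,000-character
# document yields roughly three chunks, each sharing about half its text
# with its neighbour, so retrieval rarely cuts a passage off mid-thought.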

def embedThetext(text):
    """Embed each chunk and package it as a Pinecone vector record."""
    st.session_state.processing = "Embedding text..."
    embeddings = model.encode(text)
    metadata_list = [{"text": s} for s in text]
    ids = [f'id-{randomIdGenerate()}' for _ in range(len(text))]
    vectors = [
        # Pinecone expects plain lists of floats, not numpy arrays.
        {'id': id_, 'values': embedding.tolist(), 'metadata': metadata}
        for id_, embedding, metadata in zip(ids, embeddings, metadata_list)
    ]
    return vectors
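# Each record looks like:
#   {'id': 'id-AB12C', 'values': [0.013, -0.027, ...], 'metadata': {'text': '...'}}
# all-mpnet-base-v2 emits 768-dimensional embeddings, so 'example-index'
# must have been created with dimension=768 (typically with a cosine metric).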

def saveInPinecone(vector):
    """Upsert the vectors under this session's namespace to isolate each upload."""
    st.session_state.processing = "Inserting into Pinecone..."
    index.upsert(vectors=vector, namespace=st.session_state.namespace)
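# A single upsert call is fine for small PDFs; for large documents the usual
# pattern is batched upserts (a sketch, assuming ~100-record batches):
#
#     for i in range(0, len(vector), 100):
#         index.upsert(vectors=vector[i:i + 100],
#                      namespace=st.session_state.namespace)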

def getFinalResponse(user_question):
    """Retrieve the most relevant chunks from Pinecone and answer via Groq."""
    query_embedding = model.encode([user_question])[0].tolist()
    result = index.query(
        top_k=5,
        namespace=st.session_state.namespace,
        vector=query_embedding,
        include_metadata=True,  # metadata carries the chunk text; raw values aren't needed
    )
    matched_info = ' '.join(item['metadata']['text'] for item in result['matches'])
    sources = [item['metadata']['text'] for item in result['matches']]
    context = f"Information: {matched_info} and the sources: {sources}"
    sys_prompt = f"""
                Instructions:
                - Answer only from the context provided; never use external knowledge.
                - If a question falls outside the context, reply that it is out of
                  context and give no other information.
                - Do not preface answers with phrases like "According to the
                  provided information".
                - Cite your sources.
                Context: {context}
                """

    prompt = ChatPromptTemplate.from_messages(
        [
            # Persistent system prompt, always included at the start of the chat.
            SystemMessage(content=sys_prompt),
            # Replaced by the actual chat history, which maintains context.
            MessagesPlaceholder(variable_name="chat_history"),
            # The user's current input is injected here.
            HumanMessagePromptTemplate.from_template("{human_input}"),
        ]
    )
    groq_chat = ChatGroq(
        groq_api_key=st.secrets["GROQ_API_KEY"],
        model_name="llama3-8b-8192",
    )
    conversation = LLMChain(
        llm=groq_chat,          # The Groq chat model initialized above.
        prompt=prompt,          # The constructed prompt template.
        verbose=False,          # Set to True for debugging output.
        memory=st.session_state.memory,  # Windowed conversation history.
    )
    response = conversation.predict(human_input=user_question)
    st.write(response)
    return {'question': user_question, 'answer': response}

conversational_memory_length = 5
if 'memory' not in st.session_state:
    st.session_state.memory = ConversationBufferWindowMemory(
        k=conversational_memory_length, memory_key="chat_history", return_messages=True
    )
if 'processing' not in st.session_state:
    st.session_state.processing = 'Processing...'
if 'namespace' not in st.session_state:
    st.session_state.namespace = randomIdGenerate()

def main():
    with st.sidebar:
        st.header("Upload Multiple PDF Files Here", divider='rainbow')
        st.write("A new namespace is chosen on every refresh, so data from "
                 "before a reload is no longer accessible.")
        files = st.file_uploader("Select PDF files", accept_multiple_files=True)
        button = st.button("Process")
        if button:
            if files:
                with st.spinner(st.session_state.processing):
                    readFiles(files)
                    st.success('Files Processed Successfully')
            else:
                st.error('No files selected')

    st.header("Chat with your PDF | RAG", divider='rainbow')
    # Replay the stored transcript so history survives Streamlit reruns.
    for chat in st.session_state.body:
        with st.chat_message("user"):
            st.write(chat["question"])
        with st.chat_message("assistant"):
            st.write(chat["answer"])
    user_question = st.chat_input('Ask Something')
    if user_question:
        st.chat_message("user").write(user_question)
        with st.spinner("Processing..."):
            result = getFinalResponse(user_question)
            st.session_state.body.append(result)
           
if __name__ == "__main__":
    main()
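# Running the app (a sketch; the filename app.py is an assumption):
#
#     streamlit run app.py
#
# Secrets are expected in .streamlit/secrets.toml:
#
#     PINE_CONE_KEY = "..."
#     GROQ_API_KEY = "..."
#
# The Pinecone index 'example-index' must already exist and match the
# embedding model's 768-dimension output.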