# ArturG9's picture
# Update app.py
# 8001400 verified
# raw
# history blame
# 5.63 kB
import os
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain_community.llms import llamacpp
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
from langchain.prompts import PromptTemplate,SystemMessagePromptTemplate,ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain, ConversationalRetrievalChain
from langchain.text_splitter import TokenTextSplitter,RecursiveCharacterTextSplitter
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from htmlTemplates import css, bot_template, user_template
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
# LangSmith tracing configuration.
# os.environ only accepts strings, so the API key is exported (and tracing
# switched on) only when the "lang_api_key" variable is actually present;
# assigning None would raise TypeError as soon as this module is imported.
lang_api_key = os.getenv("lang_api_key")
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ["LANGCHAIN_PROJECT"] = "Chat with multiple PDFs"
if lang_api_key:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = lang_api_key
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (in this
            app, the files returned by the uploader). ``None`` or an empty
            iterable is treated as "no documents".

    Returns:
        The text of all pages in document order and page order, joined
        without any separator. Returns ``""`` when there is nothing to read.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        # A page may yield no text at all (e.g. a scanned image); `or ""`
        # keeps a None result from breaking the final join.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    # Single join instead of repeated `+=` on a growing string.
    return "".join(page_texts)
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunk size (250) and overlap (50) are measured with the tiktoken-based
    length function set up by ``from_tiktoken_encoder``.

    Args:
        text: The full text to split.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    if not text:
        # Nothing to split; skip building the splitter altogether.
        return []
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250,
        chunk_overlap=50,
        # r"(?<=\. )" is a regex look-behind (split right after ". "). It is a
        # raw string to avoid the invalid "\." escape, and it only works when
        # separators are interpreted as regular expressions.
        separators=["\n \n \n", "\n \n", "\n1", r"(?<=\. )", " ", ""],
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)
def get_vectorstore(text_chunks):
    """Embed the text chunks and index them in a Chroma vector store.

    Args:
        text_chunks: List of chunk strings to embed and store.

    Returns:
        The ``Chroma`` vector store built from ``text_chunks``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="Alibaba-NLP/gte-base-en-v1.5",
        # trust_remote_code is a boolean flag; pass True, not the string 'True'.
        model_kwargs={"device": "cpu", "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )
    # NOTE(review): the store is persisted to a fixed directory, so repeated
    # "Process" runs presumably accumulate chunks from earlier uploads in the
    # same on-disk collection -- confirm this is intended.
    return Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
        persist_directory="docs/chroma/",
    )
def get_conversation_chain(vectorstore):
    """Build the retrieval-augmented question-answering chain.

    Args:
        vectorstore: Vector store exposing ``as_retriever``; it supplies the
            context passages for each question.

    Returns:
        A runnable that is invoked with ``{"question": <str>}`` and returns
        the model's answer as a string.
    """
    from operator import itemgetter

    llm = llamacpp.LlamaCpp(
        model_path="qwen2-0_5b-instruct-q8_0.gguf",
        n_gpu_layers=0,
        temperature=0.1,
        top_p=0.9,
        n_ctx=20000,
        n_batch=2000,
        max_tokens=300,
        repeat_penalty=1.9,
        last_n_tokens_size=300,
        verbose=False,
    )
    # The number of documents to return must go through search_kwargs; a bare
    # `k=` keyword is not a retriever search parameter.
    retriever = vectorstore.as_retriever(
        search_type="mmr", search_kwargs={"k": 7}
    )
    prompt = hub.pull("rlm/rag-prompt")

    def _format_docs(docs):
        """Join the retrieved documents' text into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # The chain is invoked with {"question": ...}. Only the question string is
    # sent to the retriever, and the question is also forwarded to the prompt,
    # which needs both "context" and "question".
    rag_chain = (
        {
            "context": itemgetter("question") | retriever | _format_docs,
            "question": itemgetter("question"),
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain
def main():
    """Render the Streamlit page: a question box and a sidebar PDF uploader.

    Questions are only forwarded to ``handle_userinput`` once a conversation
    chain has been stored in the session by a successful "Process" run.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    st.header("Chat with multiple PDFs :books:")

    if user_question := st.text_input("Ask a question about your documents:"):
        if "conversation" in st.session_state:
            handle_userinput(user_question)
        else:
            # No chain exists until documents have been processed.
            st.warning("Please upload your PDFs and click on 'Process' first.")

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True,
        )
        if st.button("Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    # get pdf text
                    raw_text = get_pdf_text(pdf_docs)
                    # get the text chunks
                    text_chunks = get_text_chunks(raw_text)
                    if text_chunks:
                        # create vector store
                        vectorstore = get_vectorstore(text_chunks)
                        # create conversation chain
                        st.session_state.conversation = get_conversation_chain(
                            vectorstore
                        )
                    else:
                        # Avoid building a vector store from zero chunks.
                        st.error(
                            "No extractable text was found in the uploaded PDFs."
                        )
def handle_userinput(user_question):
    """Answer a question with the stored chain and render the chat history.

    Args:
        user_question: The question text typed by the user.

    Side effects:
        Appends the question and the answer to
        ``st.session_state.chat_history`` and writes every history entry to
        the page. Shows a warning and returns early when no conversation
        chain has been stored in the session yet.
    """
    import html

    conversation = st.session_state.get("conversation")
    if conversation is None:
        st.warning("Please upload your PDFs and click on 'Process' first.")
        return

    if "chat_history" not in st.session_state:
        st.session_state["chat_history"] = [
            {"role": "assistant", "content": "Hi, I'm a Q&A chatbot who is based on your imported pdf documents . How can I help you?"}
        ]

    # Invoke the chain before touching the history so a failed call does not
    # leave an unanswered user message behind.
    response = conversation.invoke({"question": user_question})
    st.session_state.chat_history.append({"role": "user", "content": user_question})
    st.session_state.chat_history.append({"role": "assistant", "content": response})

    for message in st.session_state.chat_history:
        # Choose the template from the recorded role; the history starts with
        # an assistant greeting, so index parity does not identify the speaker.
        template = user_template if message["role"] == "user" else bot_template
        # The text is injected into raw HTML, so escape it first.
        safe_content = html.escape(str(message["content"]))
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
# Script entry point: launch the app only when this file is executed directly.
if __name__ == "__main__":
    main()