# Vincent Claes
# working version with streamlit
# 5288ac6
import os
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Default chat model used to answer queries.
model = "gpt-3.5-turbo"

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# Use .get() so a missing key yields an empty string instead of raising
# KeyError — with a bare os.environ[...] lookup the warning below could
# never be reached.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )
@st.cache_resource(show_spinner=False)
def load_data():
    """Load all PDFs from ./data, chunk them, and build a Chroma vector store.

    Cached with st.cache_resource so the (slow) indexing runs only once per
    server process rather than on every Streamlit rerun.

    Returns:
        Chroma: vector store over all document chunks, embedded with
        OpenAI embeddings.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Load every PDF under ./data, recursively.
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()
        # Flatten hard line breaks so the splitter works on continuous text.
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")
        # Split into overlapping chunks sized for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)
        # Prefix each chunk with its source file name so the LLM can tell
        # which document a chunk came from (used for per-document grouping).
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"
        # Embed the chunks and build the vector store.
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore
vectorstore = load_data()

# Main question form.
with st.form(key="qa_form"):
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Example questions: clicking one submits it as the active query.
# (The original copy-pasted five branches and the fifth button wrongly
# submitted the fourth question; a single loop fixes both issues.)
with st.expander("Examples"):
    with st.form(key="ex1"):
        example_queries = [
            "what is the process of raising an incident?",
            "what is the release management process?",
            "What is process for identifying risks that can impact the desired outcomes of a project?",
            "What is the process?",
            "What is Cx0 program management?",
        ]
        for example_query in example_queries:
            if st.form_submit_button(example_query):
                query = example_query
                submit = True

with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")
def is_query_valid(query: str) -> bool:
    """Return True for a non-empty question; otherwise show an error and return False."""
    if query:
        return True
    st.error("Please enter a question!")
    return False
if submit:
    if not is_query_valid(query):
        st.stop()

    with st.spinner(text="Thinking about an answer ..."):
        # Two-column layout: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # Reuse the module-level model constant so the configured model and
        # the chain stay in sync (was hard-coded to "gpt-3.5-turbo" here).
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            # Retrieve the 6 most similar chunks as answer context.
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )

        # Instruction prepended to the user's query.
        SYSTEM_MESSAGE = "You are an internal document expert and you respond to the query in 1 to 5 sentences. If the answer is a list, write bullet points."
        if return_all_chunks:
            # Leading space and period added: the original concatenation ran
            # this sentence into "...bullet points.Group the answer...".
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result["result"])

        with sources_col:
            st.markdown("#### Sources")
            source_docs = [
                (x.metadata["source"], x.page_content)
                for x in result["source_documents"]
            ]
            # Show each retrieved chunk with its originating document.
            for i, (source, content) in enumerate(source_docs, start=1):
                st.markdown(f"* CHUNK: {i}")
                st.markdown(f"original doc: {source}")
                st.markdown(content)