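"""Streamlit app: question answering over the PDF documents in ./data.

Documents are chunked, embedded with OpenAI embeddings, and indexed in an
in-memory Chroma vector store; queries are answered by a LangChain
RetrievalQA chain on top of gpt-3.5-turbo.
"""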
import os

import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

model = "gpt-3.5-turbo"
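# chat model used for answer generation (retrieval uses OpenAIEmbeddings below)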

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    st.warning(
        "Set the OPENAI_API_KEY environment variable. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )
    st.stop()
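# ChatOpenAI and OpenAIEmbeddings pick the key up from the environment, so it
# is not passed around explicitly below.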

@st.cache_resource(show_spinner=False)
def load_data():
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # load the documents
        loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        docs = loader.load()
        # replace all newlines with spaces
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")
        # split the documents into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(docs)
        # prefix every chunk with its source file so the model can attribute answers
        for doc in all_splits:
            file_name = doc.metadata["source"]
            doc.page_content = f"document: {file_name}\n{doc.page_content}"
        # construct the vector store
        vectorstore = Chroma.from_documents(
            documents=all_splits, embedding=OpenAIEmbeddings()
        )
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return vectorstore
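
# build (or fetch the cached) index; st.cache_resource reuses it across reruns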
vectorstore = load_data()

with st.form(key="qa_form"):
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

with st.expander("Examples"):
    with st.form(key="ex1"):
        ex1_query = "what is the process of raising an incident?"
        if st.form_submit_button(ex1_query):
            query = ex1_query
            submit = True
        ex2_query = "what is the release management process?"
        if st.form_submit_button(ex2_query):
            query = ex2_query
            submit = True
        ex3_query = "What is the process for identifying risks that can impact the desired outcomes of a project?"
        if st.form_submit_button(ex3_query):
            query = ex3_query
            submit = True
        ex4_query = "What is the process?"
        if st.form_submit_button(ex4_query):
            query = ex4_query
            submit = True
        ex5_query = "What is Cx0 program management?"
        if st.form_submit_button(ex5_query):
            query = ex5_query
            submit = True

with st.expander("Advanced Options"):
    group_per_document = st.checkbox("Group answer per document")

def is_query_valid(query: str) -> bool:
    if not query:
        st.error("Please enter a question!")
        return False
    return True
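
# on submit: retrieve the most relevant chunks, ask the LLM, and show the
# answer side by side with the source chunks it was grounded on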
if submit:
    if not is_query_valid(query):
        st.stop()
    with st.spinner(text="Thinking about an answer ..."):
        # output columns
        answer_col, sources_col = st.columns(2)
        llm = ChatOpenAI(model_name=model, temperature=0)
        # "stuff" concatenates the k retrieved chunks directly into the prompt
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )
        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query in"
            " 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if group_per_document:
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += "\n\nQuery:\n"
        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})
    with answer_col:
        st.markdown("#### Answer")
        st.markdown(result["result"])

    with sources_col:
        st.markdown("#### Sources")
        source_docs = [
            (x.metadata["source"], x.page_content) for x in result["source_documents"]
        ]
        for i, (source, content) in enumerate(source_docs, start=1):
            st.markdown(f"* CHUNK: {i}")
            st.markdown(f"original doc: {source}")
            st.markdown(content)
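
# note: the `langchain.*` import paths above are the pre-0.1 legacy layout;
# newer LangChain releases moved these classes to langchain_community /
# langchain_openai, so this file assumes an older pinned langchain + openai pair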