Spaces:
Runtime error
Runtime error
File size: 4,765 Bytes
5288ac6 9c2548e 611aebd 5288ac6 611aebd 5288ac6 611aebd 5288ac6 611aebd 5288ac6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Chat model used for answering queries.
model = "gpt-3.5-turbo"

st.set_page_config(
    page_title="Randstad Digital Doc QA", page_icon=":robot_face:", layout="wide"
)
st.header("Randstad Digital Doc QA :robot_face:")

# BUG FIX: os.environ["OPENAI_API_KEY"] raises KeyError when the variable is
# unset, so the warning below could never be reached. Use .get() so a missing
# key yields an empty string and the user sees the hint instead of a crash.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )
@st.cache_resource(show_spinner=False)
def load_data():
    """Load every PDF under ./data, chunk it, and return a Chroma vector store.

    Cached by Streamlit so the (slow) indexing runs once per session.
    """
    with st.spinner(
        text="Loading and indexing the documents – hang tight! This should take 1-2 minutes."
    ):
        # Gather every PDF in the data directory.
        pdf_loader = DirectoryLoader(
            "./data", glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
        )
        documents = pdf_loader.load()

        # Flatten newlines so the splitter works on continuous text.
        for document in documents:
            document.page_content = document.page_content.replace("\n", " ")

        # Break the documents into overlapping chunks.
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_documents(documents)

        # Prefix each chunk with its source file so answers can cite it.
        for chunk in chunks:
            source_file = chunk.metadata["source"]
            chunk.page_content = f"document: {source_file}\n{chunk.page_content}"

        # Build the vector store from the embedded chunks.
        # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
        # svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
        return Chroma.from_documents(documents=chunks, embedding=OpenAIEmbeddings())
vectorstore = load_data()

# Main question form: free-text query plus a submit button.
with st.form(key="qa_form"):
    # BUG FIX: user-facing label misspelled "documenation".
    query = st.text_area("Ask me anything about the documentation!")
    submit = st.form_submit_button("Submit")

# Canned example questions; clicking one submits it as the query.
with st.expander("Examples"):
    with st.form(key="ex1"):
        example_queries = [
            "what is the process of raising an incident?",
            "what is the release management process?",
            "What is process for identifying risks that can impact the desired outcomes of a project?",
            "What is the process?",
            "What is Cx0 program management?",
        ]
        # BUG FIX: the original copy-pasted five identical stanzas and the
        # fifth button assigned ex4_query instead of ex5_query, so clicking
        # "What is Cx0 program management?" asked the wrong question. A single
        # loop removes that class of error.
        for example_query in example_queries:
            if st.form_submit_button(example_query):
                query = example_query
                submit = True

# Advanced options affecting how the answer is formatted.
with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Group answer per document")
def is_query_valid(query: str) -> bool:
    """Return True for a non-empty query; otherwise show an error and return False."""
    if query:
        return True
    st.error("Please enter a question!")
    return False
# Handle a submitted question: retrieve relevant chunks and render the answer.
if submit:
    if not is_query_valid(query):
        st.stop()

    with st.spinner(text="Thinking about an answer ..."):
        # Output columns: answer on the left, source chunks on the right.
        answer_col, sources_col = st.columns(2)

        # llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
        # Use the module-level `model` constant instead of repeating the literal.
        llm = ChatOpenAI(model_name=model, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
            return_source_documents=True,
        )

        # Prompt preamble prepended to the user's question.
        SYSTEM_MESSAGE = (
            "You are an internal document expert and you respond to the query"
            " in 1 to 5 sentences. If the answer is a list, write bullet points."
        )
        if return_all_chunks:
            # BUG FIX: the original appended "Group the answer per document"
            # with no separator, producing "...bullet points.Group the answer...".
            SYSTEM_MESSAGE += " Group the answer per document."
        SYSTEM_MESSAGE += " \n\nQuery:\n"

        result = qa_chain({"query": f"{SYSTEM_MESSAGE}{query}"})

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result["result"])

        with sources_col:
            st.markdown("#### Sources")
            source_docs = [
                (x.metadata["source"], x.page_content)
                for x in result["source_documents"]
            ]
            # Removed the dead `lines` accumulator: it was appended to on every
            # iteration but never read or joined anywhere.
            for i, (source_file, content) in enumerate(source_docs, start=1):
                st.markdown(f"* CHUNK: {i}")
                st.markdown(f"original doc: {source_file}")
                st.markdown(f"{content}")
|