HR_Doc / app.py
SnehaAkula's picture
Update app.py
600695a verified
raw
history blame contribute delete
No virus
4.54 kB
import os
import streamlit as st
import fitz
from PIL import Image
import tempfile
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from docx import Document
import io
# The OpenAI API key is read from the OPENAI_API_KEY environment variable
# (e.g. a deployment secret). It must never be hard-coded in source: a key
# committed to a repository is public and has to be revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    st.warning(
        "OPENAI_API_KEY is not set. Configure it as an environment variable "
        "or deployment secret to enable question answering."
    )

# Initialize the conversation history once per browser session so that it
# survives Streamlit's script reruns.
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []
# Function to load document and perform question answering (cached)
from docx import Document
@st.cache_data
def process_document(uploaded_file, query):
    """Answer ``query`` against an uploaded PDF or DOCX file.

    The upload is written to a temporary file (the LangChain loaders used
    here take a file path), loaded into documents, and passed to a
    question-answering chain backed by ``OpenAI()``.

    Args:
        uploaded_file: The uploaded file object; it must expose ``name`` and
            ``getvalue()``.
        query: The question to ask about the document.

    Returns:
        A ``(answer, document_text)`` tuple. ``document_text`` is the
        newline-joined paragraph text for ``.docx`` files and ``None`` for
        PDFs. For an unsupported extension an error is shown and
        ``("", None)`` is returned.
    """
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    # Reject unsupported formats before creating any temporary file, so the
    # early return cannot leave a stray file behind.
    if file_extension not in (".pdf", ".docx"):
        st.error("Unsupported file format. Please upload a PDF file (.pdf) or a Word document (.docx).")
        return "", None

    # getvalue() returns the whole payload regardless of the current read
    # position; read() would yield nothing if the buffer was already consumed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
    tmp_path = tmp_file.name

    try:
        if file_extension == ".pdf":
            loader = PyPDFLoader(tmp_path)
            document_text = None
        else:
            loader = Docx2txtLoader(tmp_path)
            document = Document(tmp_path)
            document_text = "\n".join(paragraph.text for paragraph in document.paragraphs)

        documents = loader.load()

        # Load QA chain and perform question answering
        chain = load_qa_chain(llm=OpenAI(), verbose=True)
        response = chain.invoke({"input_documents": documents, "question": query})
    finally:
        # Always remove the temporary file, even if loading or the LLM call fails.
        os.unlink(tmp_path)

    return response["output_text"], document_text
def update_conversation(query, response):
    """Append one question/answer exchange to the session's conversation history."""
    exchange = {"question": query, "answer": response}
    history = st.session_state.conversation_history
    history.append(exchange)
def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to a PIL image.

    Args:
        pdf_bytes: The raw bytes of a PDF document.

    Returns:
        A list with one RGB ``PIL.Image`` per page, in page order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pixmap = page.get_pixmap()
            # Image.frombytes copies the pixel data, so the images remain
            # valid after the document is closed below.
            images.append(
                Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            )
        return images
    finally:
        # Release the document's native resources even if rendering fails.
        doc.close()
def main():
    """Render the Streamlit UI.

    The sidebar holds the file uploader, the question box and the "Ask"
    button. The main area previews the uploaded document (paragraph text for
    DOCX, a selectable page image for PDF) and lists the conversation history
    stored in ``st.session_state``.
    """
    # Sidebar header
    st.sidebar.title("7steps.AI")
    st.sidebar.markdown("---")

    # File uploader for document in sidebar
    uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])

    # Display document content or images
    if uploaded_file is not None:
        st.title("Document Content")
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension == ".docx":
            # Read the paragraphs directly for the preview. Going through
            # process_document with an empty question would trigger a
            # pointless LLM request on every new upload.
            document = Document(io.BytesIO(uploaded_file.getvalue()))
            document_text = "\n".join(paragraph.text for paragraph in document.paragraphs)
            st.text_area("Document Text", value=document_text, height=300)
        elif file_extension == ".pdf":
            images = pdf_to_images(uploaded_file.getvalue())
            if images:
                page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
                current_page = images[page_number - 1]
                st.image(current_page, caption=f"Page {page_number}", use_column_width=True)
                # Download button for the currently displayed page
                img_bytes = io.BytesIO()
                current_page.save(img_bytes, format='PNG')
                st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')

    # Text box for new question in sidebar
    query = st.sidebar.text_input("Enter your question:")

    # "Ask" button in sidebar
    if st.sidebar.button("Ask"):
        if uploaded_file is None:
            st.sidebar.write("Please upload a document first.")
        elif not query.strip():
            # Do not spend an LLM request on an empty question.
            st.sidebar.write("Please enter a question.")
        else:
            # Process document and record the exchange when an answer came back
            response, _ = process_document(uploaded_file, query)
            if response:
                update_conversation(query, response)

    # Display conversation history
    st.title("Conversation History")
    for item in st.session_state.conversation_history:
        st.write("You:", item["question"])
        st.write("AI:", item["answer"])
# Script entry point: build the UI only when this file is executed directly.
if __name__ == "__main__":
    main()