Spaces:
Sleeping
Sleeping
File size: 4,960 Bytes
6728994 b26dc5d 6728994 fb553a2 6728994 1d1c23a 9322867 6728994 1d1c23a fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 8c24c78 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 6728994 fb553a2 ff02b0e 6728994 fb553a2 6728994 fb553a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import os
import streamlit as st
import fitz
from PIL import Image
import tempfile
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.chains.question_answering import load_qa_chain
from docx import Document
import io
# from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint
# The Hugging Face API token is taken from the environment; the endpoint
# client below is configured with it, so fail fast when it is absent.
huggingface_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
if huggingface_token is None:
    raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")

# Hosted endpoint client shared by every question-answering call.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=huggingface_token,
)

# Create the per-session conversation history the first time the script runs.
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []
# Function to load document and perform question answering (cached)
@st.cache_data
def process_document(uploaded_file, query):
    """Answer a question about an uploaded PDF or Word document.

    The upload is written to a temporary file (the loaders work on paths),
    loaded, and passed to a question-answering chain together with ``query``.

    Args:
        uploaded_file: File object returned by the Streamlit uploader; its
            ``name`` decides which loader is used and ``getvalue()`` supplies
            the bytes.
        query: The question to ask. When it is empty or only whitespace the
            model is not called and the answer is returned as ``""``; this is
            what the DOCX preview relies on to fetch the text cheaply.

    Returns:
        A tuple ``(answer, document_text)``. ``answer`` is the chain's
        ``"output_text"`` (or ``""`` for an unsupported file or blank query).
        ``document_text`` is the paragraph text joined by newlines for
        ``.docx`` files and ``None`` otherwise.
    """
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    # Reject unsupported formats before touching the disk so that no
    # temporary file is created (and leaked) for them.
    if file_extension not in (".pdf", ".docx"):
        st.error("Unsupported file format. Please upload a PDF file (.pdf) or a Word document (.docx).")
        return "", None

    # getvalue() returns the whole buffer regardless of the current stream
    # position; read() would yield nothing if the file was already consumed
    # by an earlier call in the same script run.
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())

    try:
        if file_extension == ".pdf":
            loader = PyPDFLoader(tmp_file.name)
            document_text = None
        else:
            loader = Docx2txtLoader(tmp_file.name)
            document = Document(tmp_file.name)
            document_text = "\n".join(paragraph.text for paragraph in document.paragraphs)

        # A blank question has nothing to answer; skip the remote model call.
        if not (query and query.strip()):
            return "", document_text

        documents = loader.load()

        # Load QA chain and perform question answering
        chain = load_qa_chain(llm=llm, verbose=True)
        response = chain.invoke({"input_documents": documents, "question": query})
        return response["output_text"], document_text
    finally:
        # Always remove the temporary file, including when loading or the
        # model call raises.
        os.unlink(tmp_file.name)
# Function to update conversation history
def update_conversation(query, response):
    """Append one question/answer exchange to the session's conversation history."""
    exchange = {"question": query, "answer": response}
    st.session_state.conversation_history.append(exchange)
# Function to convert PDF pages to images
def pdf_to_images(pdf_bytes):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in page
        order. The list is empty when the document has no pages.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            # alpha=False keeps the pixmap at three channels so that it
            # matches the "RGB" mode passed to PIL below.
            pixmap = page.get_pixmap(alpha=False)
            images.append(
                Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            )
        return images
    finally:
        # Release the document's resources even if rendering a page fails.
        doc.close()
# Streamlit UI
def main():
    """Render the app: sidebar upload and question box, document preview, answer.

    A ``.docx`` upload is previewed as text obtained from ``process_document``;
    a ``.pdf`` upload is previewed one page at a time as an image with a PNG
    download button. Pressing "Ask" runs ``process_document`` with the typed
    question, shows the exchange, and records it in the conversation history.
    """
    # Set sidebar title
    st.sidebar.title("7steps.AI")
    st.sidebar.markdown("---")

    # File uploader for document in sidebar
    uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])

    # Display document content or images
    if uploaded_file is not None:
        st.title("Document Content")
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension == ".docx":
            # An empty question is passed because only the text is needed here.
            _, document_text = process_document(uploaded_file, "")
            if document_text is not None:
                st.text_area("Document Text", value=document_text, height=300)
        elif file_extension == ".pdf":
            images = pdf_to_images(uploaded_file.getvalue())
            if images:
                page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
                selected_page = images[page_number - 1]
                # NOTE(review): newer Streamlit releases deprecate
                # use_column_width in favour of use_container_width - confirm
                # against the version this app is pinned to before switching.
                st.image(selected_page, caption=f"Page {page_number}", use_column_width=True)

                # Download button for images
                img_bytes = io.BytesIO()
                selected_page.save(img_bytes, format='PNG')
                st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')

    # Text box for new question in sidebar
    query = st.sidebar.text_input("Enter your question:")

    # "Ask" button in sidebar
    if st.sidebar.button("Ask"):
        if uploaded_file is None:
            st.sidebar.write("Please upload a document first.")
        elif not query.strip():
            # Do not send an empty prompt to the model.
            st.sidebar.write("Please enter a question first.")
        else:
            # Process document and display response
            response, _ = process_document(uploaded_file, query)
            if response:  # Check if response is not empty
                st.write("You:", query)
                st.write("AI:", response)
                # Update conversation history
                update_conversation(query, response)

    # # Display conversation history
    # st.title("Conversation History")
    # for item in st.session_state.conversation_history:
    #     st.write("You:", item["question"])
    #     st.write("AI:", item["answer"])
# Script entry point: build the UI only when this file is executed directly.
if __name__ == "__main__":
    main()
|