HR_Doc_RAG

Sleeping

App Files Files Community

SnehaAkula committed on Aug 5

Commit

1d1c23a

•

1 Parent(s): 9322867

Upload app.py

Browse files

Files changed (1) hide show

app.py +25 -74

app.py CHANGED Viewed

@@ -5,46 +5,27 @@ from PIL import Image
 import tempfile
 from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
 from langchain.chains.question_answering import load_qa_chain
-# from langchain_openai import OpenAI
 from docx import Document
 import io
 from langchain_community.llms import HuggingFaceHub
-import getpass
-# os.environ["GOOGLE_API_KEY"] = "AIzaSyC6o10htIT1d2DCPe8fJ09UR14qcX9EVPc"
-# from langchain_google_genai import ChatGoogleGenerativeAI
-# llm = ChatGoogleGenerativeAI(
-#     model="gemini-pro",
-#     temperature=0,
-#     max_tokens=None,
-#     timeout=None,
-#     max_retries=2,
-# )
 huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-llm=HuggingFaceHub(repo_id="microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature":0.5, "max_length":128})
-# Set OpenAI API key
-# os.environ["OPENAI_API_KEY"] = "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"
 # Initialize conversation history list
 if "conversation_history" not in st.session_state:
     st.session_state.conversation_history = []
 # Function to load document and perform question answering (cached)
-from docx import Document
 @st.cache_data
 def process_document(uploaded_file, query):
-    # Save uploaded file to temporary directory
     with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
         tmp_file.write(uploaded_file.read())
-        # Load document based on file type
         file_extension = os.path.splitext(uploaded_file.name)[1].lower()
         if file_extension == ".pdf":
             loader = PyPDFLoader(tmp_file.name)
@@ -54,59 +35,42 @@ def process_document(uploaded_file, query):
             document = Document(tmp_file.name)
             document_text = "\n".join([paragraph.text for paragraph in document.paragraphs])
         else:
-            st.error("Unsupported file format. Please upload a text file (.txt), a PDF file (.pdf), or a Word document (.docx).")
-            return "", None
-    documents = loader.load()
-    # Load QA chain
-    # chain = load_qa_chain(llm=OpenAI(), verbose=True)
-    chain = load_qa_chain(llm=llm, verbose=True)
-    # Perform question answering
-    response = chain.invoke({"input_documents": documents, "question": query})
-    # Remove temporary file
-    os.unlink(tmp_file.name)
-    return response["output_text"]
 # Function to update conversation history
-def update_conversation(query, response):
-    st.session_state.conversation_history.append({"question": query, "answer": response})
-# Function to convert PDF pages to images
 def pdf_to_images(pdf_bytes):
-    doc = fitz.open("pdf", pdf_bytes)
     images = []
-    for page_num in range(doc.page_count):
-        page = doc[page_num]
-        image = page.get_pixmap()
-        img = Image.frombytes("RGB", [image.width, image.height], image.samples)
         images.append(img)
     return images
-# Streamlit UI
 def main():
-    # Set sidebar title
-    st.sidebar.title("7steps.AI")
-    st.sidebar.markdown("---")
-    # File uploader for document in sidebar
     uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
-    # Display document content or images
     if uploaded_file is not None:
         st.title("Document Content")
         file_extension = os.path.splitext(uploaded_file.name)[1].lower()
         if file_extension in [".docx"]:
-            _, document_text = process_document(uploaded_file, "")
             if document_text is not None:
                 st.text_area("Document Text", value=document_text, height=300)
         elif file_extension == ".pdf":
@@ -114,33 +78,20 @@ def main():
             if images:
                 page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
                 st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
-                # Download button for images
                 img_bytes = io.BytesIO()
                 images[page_number - 1].save(img_bytes, format='PNG')
                 st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
-    # Text box for new question in sidebar
     query = st.sidebar.text_input("Enter your question:")
-    # "Ask" button in sidebar
     if st.sidebar.button("Ask"):
         if uploaded_file is not None:
-            # Process document and display response
             response, _ = process_document(uploaded_file, query)
-            if response:  # Check if response is not empty
-                # Update conversation history
                 st.write(response)
                 update_conversation(query, response)
         else:
             st.sidebar.write("Please upload a document first.")
-    # # Display conversation history
-    # st.title("Conversation History")
-    # for item in st.session_state.conversation_history:
-    #     st.write("You:", item["question"])
-    #     st.write("AI:", item["answer"])
-# Run the application
 if __name__ == "__main__":
     main()

 import tempfile
 from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
 from langchain.chains.question_answering import load_qa_chain
 from docx import Document
 import io
 from langchain_community.llms import HuggingFaceHub
+# Ensure you have your Hugging Face token stored in an environment variable
 huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+if huggingface_token is None:
+    raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")
+llm = HuggingFaceHub(repo_id="microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature": 0.5, "max_length": 128})
 # Initialize conversation history list
 if "conversation_history" not in st.session_state:
     st.session_state.conversation_history = []
 # Function to load document and perform question answering (cached)
 @st.cache_data
 def process_document(uploaded_file, query):
     with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
         tmp_file.write(uploaded_file.read())
         file_extension = os.path.splitext(uploaded_file.name)[1].lower()
         if file_extension == ".pdf":
             loader = PyPDFLoader(tmp_file.name)
             document = Document(tmp_file.name)
             document_text = "\n".join([paragraph.text for paragraph in document.paragraphs])
         else:
+            st.error("Unsupported file type")
+            return None, None
+        # Load and process the document
+        chain = load_qa_chain(llm, chain_type="stuff")
+        documents = loader.load()
+        response = chain.run(input_documents=documents, question=query)
+        return response, document_text
 # Function to update conversation history
+def update_conversation(question, answer):
+    st.session_state.conversation_history.append({"question": question, "answer": answer})
+# Function to convert PDF to images (required for PDF display)
 def pdf_to_images(pdf_bytes):
+    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
     images = []
+    for page_number in range(len(pdf_document)):
+        page = pdf_document.load_page(page_number)
+        pix = page.get_pixmap()
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         images.append(img)
     return images
+# Main function
 def main():
+    st.title("Document Question Answering")
+    st.sidebar.title("Upload and Ask")
     uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
     if uploaded_file is not None:
         st.title("Document Content")
         file_extension = os.path.splitext(uploaded_file.name)[1].lower()
         if file_extension in [".docx"]:
+            response, document_text = process_document(uploaded_file, "")
             if document_text is not None:
                 st.text_area("Document Text", value=document_text, height=300)
         elif file_extension == ".pdf":
             if images:
                 page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
                 st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
                 img_bytes = io.BytesIO()
                 images[page_number - 1].save(img_bytes, format='PNG')
                 st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
     query = st.sidebar.text_input("Enter your question:")
     if st.sidebar.button("Ask"):
         if uploaded_file is not None:
             response, _ = process_document(uploaded_file, query)
+            if response:
                 st.write(response)
                 update_conversation(query, response)
         else:
             st.sidebar.write("Please upload a document first.")
 if __name__ == "__main__":
     main()