SnehaAkula committed on
Commit
fb553a2
1 Parent(s): 8c24c78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -26
app.py CHANGED
@@ -7,7 +7,8 @@ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
7
  from langchain.chains.question_answering import load_qa_chain
8
  from docx import Document
9
  import io
10
- from langchain_community.llms import HuggingFaceHub
 
11
 
12
  # Ensure you have your Hugging Face token stored in an environment variable
13
  huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
@@ -15,61 +16,82 @@ huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
15
  if huggingface_token is None:
16
  raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")
17
 
18
- llm = HuggingFaceHub(repo_id="microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature": 0.5, "max_length": 128})
19
 
20
  # Initialize conversation history list
21
  if "conversation_history" not in st.session_state:
22
  st.session_state.conversation_history = []
23
 
24
  # Function to load document and perform question answering (cached)
 
@st.cache_data
def process_document(uploaded_file, query):
    """Run question answering over an uploaded PDF or Word document.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; its
            ``name`` and byte content are used.
        query: The question to ask. A blank query skips the model call so
            the function can be used purely to extract document text.

    Returns:
        A ``(response, document_text)`` tuple. ``response`` is the chain
        output, or ``None`` when the file type is unsupported or the query
        is blank. ``document_text`` is the joined paragraph text for
        ``.docx`` files and ``None`` otherwise.
    """
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    # Reject unknown types before touching the disk so nothing is left behind.
    if file_extension not in (".pdf", ".docx"):
        st.error("Unsupported file type")
        return None, None

    # getvalue() ignores the stream position; read() yields b"" if some other
    # code has already consumed the upload.
    if hasattr(uploaded_file, "getvalue"):
        payload = uploaded_file.getvalue()
    else:
        payload = uploaded_file.read()

    with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
        tmp_file.write(payload)
    tmp_path = tmp_file.name

    try:
        document_text = None
        if file_extension == ".pdf":
            loader = PyPDFLoader(tmp_path)
        else:
            loader = Docx2txtLoader(tmp_path)
            document = Document(tmp_path)
            document_text = "\n".join(
                paragraph.text for paragraph in document.paragraphs
            )

        # Nothing to ask: return the extracted text without invoking the LLM.
        if not (query and query.strip()):
            return None, document_text

        # Load and process the document
        chain = load_qa_chain(llm, chain_type="stuff")
        documents = loader.load()
        response = chain.run(input_documents=documents, question=query)
        return response, document_text
    finally:
        # delete=False means the file survives the ``with``; remove it on
        # every exit path, including loader or model failures.
        os.unlink(tmp_path)
46
 
47
  # Function to update conversation history
def update_conversation(question, answer):
    """Record one question/answer exchange in the session's history list."""
    exchange = dict(question=question, answer=answer)
    history = st.session_state.conversation_history
    history.append(exchange)
50
 
51
- # Function to convert PDF to images (required for PDF display)
def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to a PIL image.

    Args:
        pdf_bytes: Raw bytes of a PDF file.

    Returns:
        A list with one RGB ``Image`` per page, in page order. An empty or
        missing payload yields an empty list instead of an error.
    """
    if not pdf_bytes:
        return []

    images = []
    # The context manager closes the document once all pages are rendered;
    # each image is built from a pixmap's sample bytes via Image.frombytes.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap()
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
61
 
62
- # Main function
63
  def main():
64
- st.title("Document Question Answering")
65
- st.sidebar.title("Upload and Ask")
 
66
 
 
67
  uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
68
 
 
69
  if uploaded_file is not None:
70
  st.title("Document Content")
71
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
72
- if file_extension == ".docx":
73
  _, document_text = process_document(uploaded_file, "")
74
  if document_text is not None:
75
  st.text_area("Document Text", value=document_text, height=300)
@@ -78,22 +100,34 @@ def main():
78
  if images:
79
  page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
80
  st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
 
 
81
  img_bytes = io.BytesIO()
82
  images[page_number - 1].save(img_bytes, format='PNG')
83
  st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
84
 
 
85
  query = st.sidebar.text_input("Enter your question:")
86
 
 
87
  if st.sidebar.button("Ask"):
88
  if uploaded_file is not None:
 
89
  response, _ = process_document(uploaded_file, query)
90
- if response:
91
- # st.write(response)
92
- st.write("You:", query)
93
- st.write("AI:", response)
94
  update_conversation(query, response)
95
  else:
96
  st.sidebar.write("Please upload a document first.")
97
 
 
 
 
 
 
 
 
98
  if __name__ == "__main__":
99
  main()
 
 
7
  from langchain.chains.question_answering import load_qa_chain
8
  from docx import Document
9
  import io
10
+ # from langchain_community.llms import HuggingFaceHub
11
+ from langchain_huggingface import HuggingFaceEndpoint
12
 
13
  # Ensure you have your Hugging Face token stored in an environment variable
14
  huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 
16
  if huggingface_token is None:
17
  raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")
18
 
19
+ llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", huggingfacehub_api_token=huggingface_token)
20
 
21
  # Initialize conversation history list
22
  if "conversation_history" not in st.session_state:
23
  st.session_state.conversation_history = []
24
 
25
  # Function to load document and perform question answering (cached)
26
+
@st.cache_data
def process_document(uploaded_file, query):
    """Answer ``query`` against an uploaded PDF or Word document.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; its
            ``name`` and byte content are used.
        query: The question to ask. A blank query skips the model call so
            the function can be used purely to extract document text.

    Returns:
        A ``(answer, document_text)`` tuple. ``answer`` is the chain's
        ``output_text``, or ``""`` when the format is unsupported or the
        query is blank. ``document_text`` is the joined paragraph text for
        ``.docx`` files and ``None`` otherwise.
    """
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    # Reject unknown types before touching the disk so nothing is left behind.
    if file_extension not in (".pdf", ".docx"):
        st.error("Unsupported file format. Please upload a PDF file (.pdf) or a Word document (.docx).")
        return "", None

    # getvalue() ignores the stream position; read() yields b"" if some other
    # code has already consumed the upload.
    if hasattr(uploaded_file, "getvalue"):
        payload = uploaded_file.getvalue()
    else:
        payload = uploaded_file.read()

    # Save uploaded file to a temporary path the loaders can open
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
        tmp_file.write(payload)
    tmp_path = tmp_file.name

    try:
        # Load document based on file type
        if file_extension == ".pdf":
            loader = PyPDFLoader(tmp_path)
            document_text = None
        else:
            loader = Docx2txtLoader(tmp_path)
            document = Document(tmp_path)
            document_text = "\n".join(
                paragraph.text for paragraph in document.paragraphs
            )

        # Nothing to ask: return the extracted text without invoking the LLM.
        if not (query and query.strip()):
            return "", document_text

        documents = loader.load()

        # Load QA chain and perform question answering
        chain = load_qa_chain(llm=llm, verbose=True)
        response = chain.invoke({"input_documents": documents, "question": query})
        return response["output_text"], document_text
    finally:
        # delete=False means the file survives the ``with``; remove it on
        # every exit path, including loader or model failures.
        os.unlink(tmp_path)
59
+
60
+
61
+
62
 
 
 
 
 
 
63
 
64
  # Function to update conversation history
def update_conversation(query, response):
    """Append a question and its answer to the session's conversation history."""
    entry = {"question": query, "answer": response}
    st.session_state.conversation_history.append(entry)
67
 
68
+ # Function to convert PDF pages to images
def pdf_to_images(pdf_bytes):
    """Convert each page of an in-memory PDF into a PIL image.

    Args:
        pdf_bytes: Raw bytes of a PDF file.

    Returns:
        A list with one RGB ``Image`` per page, in page order. An empty or
        missing payload yields an empty list instead of an error.
    """
    if not pdf_bytes:
        return []

    # The context manager closes the document once all pages are rendered;
    # each image is built from a pixmap's sample bytes via Image.frombytes.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        pixmaps = (page.get_pixmap() for page in doc)
        return [
            Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            for pixmap in pixmaps
        ]
80
 
81
+ # Streamlit UI
82
  def main():
83
+ # Set sidebar title
84
+ st.sidebar.title("7steps.AI")
85
+ st.sidebar.markdown("---")
86
 
87
+ # File uploader for document in sidebar
88
  uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
89
 
90
+ # Display document content or images
91
  if uploaded_file is not None:
92
  st.title("Document Content")
93
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
94
+ if file_extension in [".docx"]:
95
  _, document_text = process_document(uploaded_file, "")
96
  if document_text is not None:
97
  st.text_area("Document Text", value=document_text, height=300)
 
100
  if images:
101
  page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
102
  st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
103
+
104
+ # Download button for images
105
  img_bytes = io.BytesIO()
106
  images[page_number - 1].save(img_bytes, format='PNG')
107
  st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
108
 
109
+ # Text box for new question in sidebar
110
  query = st.sidebar.text_input("Enter your question:")
111
 
112
+ # "Ask" button in sidebar
113
  if st.sidebar.button("Ask"):
114
  if uploaded_file is not None:
115
+ # Process document and display response
116
  response, _ = process_document(uploaded_file, query)
117
+ if response: # Check if response is not empty
118
+ # Update conversation history
119
+ st.write(response)
 
120
  update_conversation(query, response)
121
  else:
122
  st.sidebar.write("Please upload a document first.")
123
 
124
+ # # Display conversation history
125
+ # st.title("Conversation History")
126
+ # for item in st.session_state.conversation_history:
127
+ # st.write("You:", item["question"])
128
+ # st.write("AI:", item["answer"])
129
+
130
+ # Run the application
131
  if __name__ == "__main__":
132
  main()
133
+