SnehaAkula committed on
Commit
1d1c23a
1 Parent(s): 9322867

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -74
app.py CHANGED
@@ -5,46 +5,27 @@ from PIL import Image
5
  import tempfile
6
  from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
7
  from langchain.chains.question_answering import load_qa_chain
8
- # from langchain_openai import OpenAI
9
  from docx import Document
10
  import io
11
  from langchain_community.llms import HuggingFaceHub
12
 
13
-
14
- import getpass
15
-
16
- # os.environ["GOOGLE_API_KEY"] = "REDACTED"
17
-
18
- # from langchain_google_genai import ChatGoogleGenerativeAI
19
-
20
- # llm = ChatGoogleGenerativeAI(
21
- # model="gemini-pro",
22
- # temperature=0,
23
- # max_tokens=None,
24
- # timeout=None,
25
- # max_retries=2,
26
- # )
27
-
28
  huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
29
- llm=HuggingFaceHub(repo_id="microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature":0.5, "max_length":128})
30
 
31
- # Set OpenAI API key
32
- # os.environ["OPENAI_API_KEY"] = "REDACTED"
 
 
33
 
34
  # Initialize conversation history list
35
  if "conversation_history" not in st.session_state:
36
  st.session_state.conversation_history = []
37
 
38
  # Function to load document and perform question answering (cached)
39
- from docx import Document
40
-
41
  @st.cache_data
42
  def process_document(uploaded_file, query):
43
- # Save uploaded file to temporary directory
44
  with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
45
  tmp_file.write(uploaded_file.read())
46
-
47
- # Load document based on file type
48
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
49
  if file_extension == ".pdf":
50
  loader = PyPDFLoader(tmp_file.name)
@@ -54,59 +35,42 @@ def process_document(uploaded_file, query):
54
  document = Document(tmp_file.name)
55
  document_text = "\n".join([paragraph.text for paragraph in document.paragraphs])
56
  else:
57
- st.error("Unsupported file format. Please upload a text file (.txt), a PDF file (.pdf), or a Word document (.docx).")
58
- return "", None
59
-
60
- documents = loader.load()
61
-
62
- # Load QA chain
63
- # chain = load_qa_chain(llm=OpenAI(), verbose=True)
64
- chain = load_qa_chain(llm=llm, verbose=True)
65
-
66
- # Perform question answering
67
- response = chain.invoke({"input_documents": documents, "question": query})
68
-
69
- # Remove temporary file
70
- os.unlink(tmp_file.name)
71
-
72
- return response["output_text"]
73
-
74
-
75
-
76
 
 
 
 
 
 
77
 
78
  # Function to update conversation history
79
- def update_conversation(query, response):
80
- st.session_state.conversation_history.append({"question": query, "answer": response})
81
 
82
- # Function to convert PDF pages to images
83
  def pdf_to_images(pdf_bytes):
84
- doc = fitz.open("pdf", pdf_bytes)
85
  images = []
86
-
87
- for page_num in range(doc.page_count):
88
- page = doc[page_num]
89
- image = page.get_pixmap()
90
- img = Image.frombytes("RGB", [image.width, image.height], image.samples)
91
  images.append(img)
92
-
93
  return images
94
 
95
- # Streamlit UI
96
  def main():
97
- # Set sidebar title
98
- st.sidebar.title("7steps.AI")
99
- st.sidebar.markdown("---")
100
 
101
- # File uploader for document in sidebar
102
  uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
103
 
104
- # Display document content or images
105
  if uploaded_file is not None:
106
  st.title("Document Content")
107
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
108
  if file_extension in [".docx"]:
109
- _, document_text = process_document(uploaded_file, "")
110
  if document_text is not None:
111
  st.text_area("Document Text", value=document_text, height=300)
112
  elif file_extension == ".pdf":
@@ -114,33 +78,20 @@ def main():
114
  if images:
115
  page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
116
  st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
117
-
118
- # Download button for images
119
  img_bytes = io.BytesIO()
120
  images[page_number - 1].save(img_bytes, format='PNG')
121
  st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
122
 
123
- # Text box for new question in sidebar
124
  query = st.sidebar.text_input("Enter your question:")
125
 
126
- # "Ask" button in sidebar
127
  if st.sidebar.button("Ask"):
128
  if uploaded_file is not None:
129
- # Process document and display response
130
  response, _ = process_document(uploaded_file, query)
131
- if response: # Check if response is not empty
132
- # Update conversation history
133
  st.write(response)
134
  update_conversation(query, response)
135
  else:
136
  st.sidebar.write("Please upload a document first.")
137
 
138
- # # Display conversation history
139
- # st.title("Conversation History")
140
- # for item in st.session_state.conversation_history:
141
- # st.write("You:", item["question"])
142
- # st.write("AI:", item["answer"])
143
-
144
- # Run the application
145
  if __name__ == "__main__":
146
  main()
 
5
  import tempfile
6
  from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
7
  from langchain.chains.question_answering import load_qa_chain
 
8
  from docx import Document
9
  import io
10
  from langchain_community.llms import HuggingFaceHub
11
 
12
# The Hugging Face API token is read from the environment; the app refuses
# to start without it.
huggingface_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

if huggingface_token is None:
    raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")

# Hosted model used by the question-answering chain.
llm = HuggingFaceHub(
    repo_id="microsoft/Phi-3-mini-4k-instruct",
    model_kwargs={"temperature": 0.5, "max_length": 128},
)

# Create the conversation history list once per session.
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []
23
 
24
  # Function to load document and perform question answering (cached)
 
 
25
  @st.cache_data
26
  def process_document(uploaded_file, query):
 
27
  with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
28
  tmp_file.write(uploaded_file.read())
 
 
29
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
30
  if file_extension == ".pdf":
31
  loader = PyPDFLoader(tmp_file.name)
 
35
  document = Document(tmp_file.name)
36
  document_text = "\n".join([paragraph.text for paragraph in document.paragraphs])
37
  else:
38
+ st.error("Unsupported file type")
39
+ return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ # Load and process the document
42
+ chain = load_qa_chain(llm, chain_type="stuff")
43
+ documents = loader.load()
44
+ response = chain.run(input_documents=documents, question=query)
45
+ return response, document_text
46
 
47
  # Function to update conversation history
48
def update_conversation(question, answer):
    """Append one question/answer pair to the session's conversation history.

    Args:
        question: The question text to record.
        answer: The answer text to record alongside it.
    """
    entry = {"question": question, "answer": answer}
    st.session_state.conversation_history.append(entry)
50
 
51
# Function to convert PDF to images (required for PDF display)
def pdf_to_images(pdf_bytes):
    """Render every page of an in-memory PDF to a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list with one ``PIL.Image`` per page, in page order. The list is
        empty when the document has no pages.
    """
    images = []
    # Open the document as a context manager so it is closed even when a
    # page fails to render; the original never closed it.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap()
            # Build the image from the pixmap's raw samples, interpreted as RGB.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
61
 
62
+ # Main function
63
  def main():
64
+ st.title("Document Question Answering")
65
+ st.sidebar.title("Upload and Ask")
 
66
 
 
67
  uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])
68
 
 
69
  if uploaded_file is not None:
70
  st.title("Document Content")
71
  file_extension = os.path.splitext(uploaded_file.name)[1].lower()
72
  if file_extension in [".docx"]:
73
+ response, document_text = process_document(uploaded_file, "")
74
  if document_text is not None:
75
  st.text_area("Document Text", value=document_text, height=300)
76
  elif file_extension == ".pdf":
 
78
  if images:
79
  page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
80
  st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)
 
 
81
  img_bytes = io.BytesIO()
82
  images[page_number - 1].save(img_bytes, format='PNG')
83
  st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')
84
 
 
85
  query = st.sidebar.text_input("Enter your question:")
86
 
 
87
  if st.sidebar.button("Ask"):
88
  if uploaded_file is not None:
 
89
  response, _ = process_document(uploaded_file, query)
90
+ if response:
 
91
  st.write(response)
92
  update_conversation(query, response)
93
  else:
94
  st.sidebar.write("Please upload a document first.")
95
 
 
 
 
 
 
 
 
96
  if __name__ == "__main__":
97
  main()