Shreyas094 committed
Commit: 5676761
Parent: 40983c7

Update app.py

Files changed (1): app.py (+108, -47)
app.py CHANGED
@@ -17,6 +17,8 @@ from huggingface_hub import InferenceClient
 import inspect
 import logging
 import shutil
+import pandas as pd
+from docx import Document as DocxDocument
 
 
 # Set up basic configuration for logging
@@ -53,6 +55,23 @@ llama_parser = LlamaParse(
     language="en",
 )
 
+def load_office_document(file: NamedTemporaryFile) -> List[Document]:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    documents = []
+
+    if file_extension in ['.xlsx', '.xls']:
+        df = pd.read_excel(file.name)
+        for _, row in df.iterrows():
+            content = ' '.join(str(cell) for cell in row if pd.notna(cell))
+            documents.append(Document(page_content=content, metadata={"source": file.name}))
+    elif file_extension == '.docx':
+        doc = DocxDocument(file.name)
+        for para in doc.paragraphs:
+            if para.text.strip():
+                documents.append(Document(page_content=para.text, metadata={"source": file.name}))
+
+    return documents
+
 def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
     """Loads and splits the document into pages."""
     if parser == "pypdf":
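
A minimal sketch of how the new load_office_document loader can be exercised on its own. The loader only reads the file object's .name attribute, so a lightweight stand-in works; the SimpleNamespace shim and the "report.xlsx" sample path are illustrative, not part of the commit:

# Sketch: exercising load_office_document outside the Gradio app.
# "report.xlsx" is a hypothetical sample file.
from types import SimpleNamespace

fake_upload = SimpleNamespace(name="report.xlsx")  # loader only reads .name
docs = load_office_document(fake_upload)           # one Document per non-empty row
for d in docs[:3]:
    print(d.metadata["source"], d.page_content[:80])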
@@ -93,19 +112,23 @@ uploaded_documents = load_documents()
 def update_vectors(files, parser):
     global uploaded_documents
     logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
-
+
     if not files:
         logging.warning("No files provided for update_vectors")
-        return "Please upload at least one PDF file.", display_documents()
-
+        return "Please upload at least one file.", display_documents()
+
     embed = get_embeddings()
     total_chunks = 0
-
+
     all_data = []
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
-            data = load_document(file, parser)
+            if file.name.lower().endswith(('.xlsx', '.xls', '.docx')):
+                data = load_office_document(file)
+            else:
+                data = load_document(file, parser)
+
             if not data:
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
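
The routing rule this hunk adds reduces to a one-line predicate; a tiny illustrative check (the helper name is hypothetical, not from the commit):

# Illustrative only: the extension-based routing added in update_vectors.
OFFICE_EXTS = ('.xlsx', '.xls', '.docx')

def is_office_file(filename: str) -> bool:
    return filename.lower().endswith(OFFICE_EXTS)

assert is_office_file("Budget.XLSX")    # routed to load_office_document
assert not is_office_file("paper.pdf")  # routed to load_document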
@@ -119,33 +142,34 @@ def update_vectors(files, parser):
                 logging.info(f"Document already exists in uploaded_documents: {file.name}")
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
-
+
     logging.info(f"Total chunks processed: {total_chunks}")
-
+
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
-
+
     try:
-        if os.path.exists("faiss_database"):
-            logging.info("Updating existing FAISS database")
-            database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-            database.add_documents(all_data)
+        # Create or update the office documents vector store
+        if os.path.exists("office_faiss_database"):
+            logging.info("Updating existing office FAISS database")
+            office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
+            office_database.add_documents(all_data)
         else:
-            logging.info("Creating new FAISS database")
-            database = FAISS.from_documents(all_data, embed)
-
-        database.save_local("faiss_database")
-        logging.info("FAISS database saved")
+            logging.info("Creating new office FAISS database")
+            office_database = FAISS.from_documents(all_data, embed)
+
+        office_database.save_local("office_faiss_database")
+        logging.info("Office FAISS database saved")
     except Exception as e:
-        logging.error(f"Error updating FAISS database: {str(e)}")
+        logging.error(f"Error updating office FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
 
     # Save the updated list of documents
     save_documents(uploaded_documents)
 
     # Return a tuple with the status message and the updated document list
-    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
+    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files.", display_documents()
 
 
 def delete_documents(selected_docs):
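
A quick sanity check for the office_faiss_database this hunk writes; a sketch assuming app.py's get_embeddings() helper, with an illustrative query string (the langchain_community import path varies by LangChain version):

# Sketch: reload the office vector store saved above and run a test query.
from langchain_community.vectorstores import FAISS  # path varies by version

embed = get_embeddings()
db = FAISS.load_local("office_faiss_database", embed,
                      allow_dangerous_deserialization=True)
for hit in db.similarity_search("quarterly revenue", k=3):
    print(hit.metadata["source"], hit.page_content[:80])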
@@ -336,45 +360,82 @@ def respond(message, history, model, temperature, num_calls, use_web_search, sel
         else:
             yield "Unable to generate a response. Please try a different query."
     else:
-        # Existing PDF search logic
+        # PDF and Office documents search logic
         try:
             embed = get_embeddings()
+            pdf_database = None
+            office_database = None
+
             if os.path.exists("faiss_database"):
-                database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-                retriever = database.as_retriever(search_kwargs={"k": 20})
-
-                all_relevant_docs = retriever.get_relevant_documents(message)
-                relevant_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
-
-                if not relevant_docs:
-                    yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
-                    return
-
-                context_str = "\n".join([doc.page_content for doc in relevant_docs])
-                logging.info(f"Context length: {len(context_str)}")
-            else:
-                context_str = "No documents available."
-                yield "No documents available. Please upload PDF documents to answer questions."
+                pdf_database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+            if os.path.exists("office_faiss_database"):
+                office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
+
+            if not pdf_database and not office_database:
+                yield "No documents available. Please upload documents to answer questions."
                 return
+
+            all_relevant_docs = []
+            if pdf_database:
+                pdf_retriever = pdf_database.as_retriever(search_kwargs={"k": 10})
+                all_relevant_docs.extend(pdf_retriever.get_relevant_documents(message))
 
-            if model.startswith("duckduckgo/"):
-                # Use DuckDuckGo chat with context
-                for partial_response in get_response_from_duckduckgo(message, model, context_str, num_calls, temperature):
-                    yield partial_response
-            elif model == "@cf/meta/llama-3.1-8b-instruct":
-                # Use Cloudflare API
-                for partial_response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="pdf"):
-                    yield partial_response
+            if office_database:
+                office_retriever = office_database.as_retriever(search_kwargs={"k": 10})
+                all_relevant_docs.extend(office_retriever.get_relevant_documents(message))
+
+            relevant_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
+
+            if not relevant_docs:
+                yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
+                return
+
+            context_str = "\n".join([doc.page_content for doc in relevant_docs])
+            logging.info(f"Total context length: {len(context_str)}")
+
+            for doc in relevant_docs:
+                logging.info(f"Document source: {doc.metadata['source']}")
+                logging.info(f"Document content preview: {doc.page_content[:100]}...")  # Log first 100 characters of each document
+
+            if model == "@cf/meta/llama-3.1-8b-instruct":
+                logging.info("Using Cloudflare API")
+                # Use Cloudflare API with the retrieved context
+                for response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="document"):
+                    yield response
             else:
+                logging.info("Using Hugging Face API")
                 # Use Hugging Face API
-                for partial_response in get_response_from_pdf(message, model, selected_docs, num_calls=num_calls, temperature=temperature):
-                    yield partial_response
+                messages = [
+                    {"role": "system", "content": "You are a highly specialized assistant with expertise in analyzing and summarizing various types of documents including PDFs, Word documents, and Excel spreadsheets. Your goal is to provide accurate, detailed, and precise summaries based on the context provided. Avoid making assumptions or adding information that is not explicitly supported by the context from the documents."},
+                    {"role": "user", "content": f"Using the following context from the uploaded documents:\n{context_str}\n\nPlease generate a step-by-step reasoning before arriving at a comprehensive and accurate summary addressing the following question: '{message}'. Ensure your response is strictly based on the provided context, highlighting key metrics, trends, and significant details relevant to the query. Avoid any speculative or unverified information."}
+                ]
+
+                client = InferenceClient(model, token=huggingface_token)
+
+                response = ""
+                for i in range(num_calls):
+                    logging.info(f"API call {i+1}/{num_calls}")
+                    for message in client.chat_completion(
+                        messages=messages,
+                        max_tokens=20000,
+                        temperature=temperature,
+                        stream=True,
+                        top_p=0.8,
+                    ):
+                        if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                            chunk = message.choices[0].delta.content
+                            response += chunk
+                            yield response  # Yield partial response
+
+                logging.info("Finished generating response")
+
         except Exception as e:
             logging.error(f"Error with {model}: {str(e)}")
             if "microsoft/Phi-3-mini-4k-instruct" in model:
                 logging.info("Falling back to Mistral model due to Phi-3 error")
                 fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
-                yield from respond(message, history, fallback_model, temperature, num_calls, selected_docs)
+                yield from respond(message, history, fallback_model, temperature, num_calls, use_web_search, selected_docs)
             else:
                 yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
 
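The retrieval flow in this hunk queries both stores and then filters by the user's document selection. Isolated as a standalone helper for clarity, a sketch of that pattern (not code from the commit; get_embeddings() is assumed from app.py, and the langchain_community import path varies by LangChain version):

# Sketch: merge hits from both FAISS stores, then keep only the chunks
# whose source file the user has selected in the UI.
import os
from langchain_community.vectorstores import FAISS  # path varies by version

def gather_relevant_docs(query, selected_docs, embed, k=10):
    hits = []
    for store in ("faiss_database", "office_faiss_database"):
        if os.path.exists(store):
            db = FAISS.load_local(store, embed, allow_dangerous_deserialization=True)
            retriever = db.as_retriever(search_kwargs={"k": k})
            hits.extend(retriever.get_relevant_documents(query))
    return [doc for doc in hits if doc.metadata["source"] in selected_docs]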
@@ -605,7 +666,7 @@ demo = gr.ChatInterface(
 with demo:
     gr.Markdown("## Upload and Manage PDF Documents")
     with gr.Row():
-        file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
+        file_input = gr.Files(label="Upload your documents", file_types=[".pdf", ".docx", ".xlsx", ".xls"])
         parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
         update_button = gr.Button("Upload Document")
         refresh_button = gr.Button("Refresh Document List")
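
For context, the components declared in this hunk are typically wired to update_vectors with a .click() handler. The wiring below is a sketch, since the actual call and the two output components sit outside the changed lines (their names here are assumptions):

# Hypothetical wiring: update_vectors returns (status message, document list),
# so the handler needs two output components; component names are assumed.
update_button.click(
    fn=update_vectors,
    inputs=[file_input, parser_dropdown],
    outputs=[status_output, document_list],
)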