Shreyas094
committed on
Commit
•
5676761
1
Parent(s):
40983c7
Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,8 @@ from huggingface_hub import InferenceClient
|
|
17 |
import inspect
|
18 |
import logging
|
19 |
import shutil
|
|
|
|
|
20 |
|
21 |
|
22 |
# Set up basic configuration for logging
|
@@ -53,6 +55,23 @@ llama_parser = LlamaParse(
|
|
53 |
language="en",
|
54 |
)
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
|
57 |
"""Loads and splits the document into pages."""
|
58 |
if parser == "pypdf":
|
@@ -93,19 +112,23 @@ uploaded_documents = load_documents()
|
|
93 |
def update_vectors(files, parser):
|
94 |
global uploaded_documents
|
95 |
logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
|
96 |
-
|
97 |
if not files:
|
98 |
logging.warning("No files provided for update_vectors")
|
99 |
-
return "Please upload at least one
|
100 |
-
|
101 |
embed = get_embeddings()
|
102 |
total_chunks = 0
|
103 |
-
|
104 |
all_data = []
|
105 |
for file in files:
|
106 |
logging.info(f"Processing file: {file.name}")
|
107 |
try:
|
108 |
-
|
|
|
|
|
|
|
|
|
109 |
if not data:
|
110 |
logging.warning(f"No chunks loaded from {file.name}")
|
111 |
continue
|
@@ -119,33 +142,34 @@ def update_vectors(files, parser):
|
|
119 |
logging.info(f"Document already exists in uploaded_documents: {file.name}")
|
120 |
except Exception as e:
|
121 |
logging.error(f"Error processing file {file.name}: {str(e)}")
|
122 |
-
|
123 |
logging.info(f"Total chunks processed: {total_chunks}")
|
124 |
-
|
125 |
if not all_data:
|
126 |
logging.warning("No valid data extracted from uploaded files")
|
127 |
return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
|
128 |
-
|
129 |
try:
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
134 |
else:
|
135 |
-
logging.info("Creating new FAISS database")
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
logging.info("FAISS database saved")
|
140 |
except Exception as e:
|
141 |
-
logging.error(f"Error updating FAISS database: {str(e)}")
|
142 |
return f"Error updating vector store: {str(e)}", display_documents()
|
143 |
|
144 |
# Save the updated list of documents
|
145 |
save_documents(uploaded_documents)
|
146 |
|
147 |
# Return a tuple with the status message and the updated document list
|
148 |
-
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files
|
149 |
|
150 |
|
151 |
def delete_documents(selected_docs):
|
@@ -336,45 +360,82 @@ def respond(message, history, model, temperature, num_calls, use_web_search, sel
|
|
336 |
else:
|
337 |
yield "Unable to generate a response. Please try a different query."
|
338 |
else:
|
339 |
-
#
|
340 |
try:
|
341 |
embed = get_embeddings()
|
|
|
|
|
|
|
342 |
if os.path.exists("faiss_database"):
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
351 |
-
return
|
352 |
-
|
353 |
-
context_str = "\n".join([doc.page_content for doc in relevant_docs])
|
354 |
-
logging.info(f"Context length: {len(context_str)}")
|
355 |
-
else:
|
356 |
-
context_str = "No documents available."
|
357 |
-
yield "No documents available. Please upload PDF documents to answer questions."
|
358 |
return
|
|
|
|
|
|
|
|
|
|
|
359 |
|
360 |
-
if
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
else:
|
|
|
369 |
# Use Hugging Face API
|
370 |
-
|
371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
except Exception as e:
|
373 |
logging.error(f"Error with {model}: {str(e)}")
|
374 |
if "microsoft/Phi-3-mini-4k-instruct" in model:
|
375 |
logging.info("Falling back to Mistral model due to Phi-3 error")
|
376 |
fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
|
377 |
-
yield from respond(message, history, fallback_model, temperature, num_calls, selected_docs)
|
378 |
else:
|
379 |
yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
|
380 |
|
@@ -605,7 +666,7 @@ demo = gr.ChatInterface(
|
|
605 |
with demo:
|
606 |
gr.Markdown("## Upload and Manage PDF Documents")
|
607 |
with gr.Row():
|
608 |
-
file_input = gr.Files(label="Upload your
|
609 |
parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
|
610 |
update_button = gr.Button("Upload Document")
|
611 |
refresh_button = gr.Button("Refresh Document List")
|
|
|
17 |
import inspect
|
18 |
import logging
|
19 |
import shutil
|
20 |
+
import pandas as pd
|
21 |
+
from docx import Document as DocxDocument
|
22 |
|
23 |
|
24 |
# Set up basic configuration for logging
|
|
|
55 |
language="en",
|
56 |
)
|
57 |
|
58 |
+
def load_office_document(file: NamedTemporaryFile) -> List[Document]:
    """Load an Excel or Word file and split it into ``Document`` chunks.

    Excel files (``.xlsx`` / ``.xls``) are read with ``pandas.read_excel``
    and yield one chunk per row: the row's non-NaN cells, converted to
    strings and joined with single spaces. Word files (``.docx``) yield one
    chunk per non-blank paragraph. Any other extension yields an empty list.

    Args:
        file: Uploaded file object; only its ``name`` attribute (used as a
            filesystem path) is read.

    Returns:
        The extracted chunks, each carrying ``{"source": file.name}`` as
        metadata.
    """
    file_extension = os.path.splitext(file.name)[1].lower()
    documents = []

    if file_extension in ('.xlsx', '.xls'):
        # read_excel is called with its defaults, exactly as before.
        df = pd.read_excel(file.name)
        for _, row in df.iterrows():
            content = ' '.join(str(cell) for cell in row if pd.notna(cell))
            # Rows whose cells are all NaN/blank carry no text; skip them so
            # empty chunks are not returned (mirrors the .docx branch, which
            # skips blank paragraphs).
            if not content.strip():
                continue
            documents.append(
                Document(page_content=content, metadata={"source": file.name})
            )
    elif file_extension == '.docx':
        # The python-docx loader is imported as DocxDocument; the bare name
        # Document is the chunk class used for the return values and has no
        # .paragraphs attribute.
        docx_file = DocxDocument(file.name)
        for para in docx_file.paragraphs:
            if para.text.strip():
                documents.append(
                    Document(page_content=para.text, metadata={"source": file.name})
                )

    return documents
|
74 |
+
|
75 |
def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
|
76 |
"""Loads and splits the document into pages."""
|
77 |
if parser == "pypdf":
|
|
|
112 |
def update_vectors(files, parser):
|
113 |
global uploaded_documents
|
114 |
logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
|
115 |
+
|
116 |
if not files:
|
117 |
logging.warning("No files provided for update_vectors")
|
118 |
+
return "Please upload at least one file.", display_documents()
|
119 |
+
|
120 |
embed = get_embeddings()
|
121 |
total_chunks = 0
|
122 |
+
|
123 |
all_data = []
|
124 |
for file in files:
|
125 |
logging.info(f"Processing file: {file.name}")
|
126 |
try:
|
127 |
+
if file.name.lower().endswith(('.xlsx', '.xls', '.docx')):
|
128 |
+
data = load_office_document(file)
|
129 |
+
else:
|
130 |
+
data = load_document(file, parser)
|
131 |
+
|
132 |
if not data:
|
133 |
logging.warning(f"No chunks loaded from {file.name}")
|
134 |
continue
|
|
|
142 |
logging.info(f"Document already exists in uploaded_documents: {file.name}")
|
143 |
except Exception as e:
|
144 |
logging.error(f"Error processing file {file.name}: {str(e)}")
|
145 |
+
|
146 |
logging.info(f"Total chunks processed: {total_chunks}")
|
147 |
+
|
148 |
if not all_data:
|
149 |
logging.warning("No valid data extracted from uploaded files")
|
150 |
return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
|
151 |
+
|
152 |
try:
|
153 |
+
# Create or update the office documents vector store
|
154 |
+
if os.path.exists("office_faiss_database"):
|
155 |
+
logging.info("Updating existing office FAISS database")
|
156 |
+
office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
|
157 |
+
office_database.add_documents(all_data)
|
158 |
else:
|
159 |
+
logging.info("Creating new office FAISS database")
|
160 |
+
office_database = FAISS.from_documents(all_data, embed)
|
161 |
+
|
162 |
+
office_database.save_local("office_faiss_database")
|
163 |
+
logging.info("Office FAISS database saved")
|
164 |
except Exception as e:
|
165 |
+
logging.error(f"Error updating office FAISS database: {str(e)}")
|
166 |
return f"Error updating vector store: {str(e)}", display_documents()
|
167 |
|
168 |
# Save the updated list of documents
|
169 |
save_documents(uploaded_documents)
|
170 |
|
171 |
# Return a tuple with the status message and the updated document list
|
172 |
+
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files.", display_documents()
|
173 |
|
174 |
|
175 |
def delete_documents(selected_docs):
|
|
|
360 |
else:
|
361 |
yield "Unable to generate a response. Please try a different query."
|
362 |
else:
|
363 |
+
# PDF and Office documents search logic
|
364 |
try:
|
365 |
embed = get_embeddings()
|
366 |
+
pdf_database = None
|
367 |
+
office_database = None
|
368 |
+
|
369 |
if os.path.exists("faiss_database"):
|
370 |
+
pdf_database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
371 |
+
|
372 |
+
if os.path.exists("office_faiss_database"):
|
373 |
+
office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
|
374 |
+
|
375 |
+
if not pdf_database and not office_database:
|
376 |
+
yield "No documents available. Please upload documents to answer questions."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
return
|
378 |
+
|
379 |
+
all_relevant_docs = []
|
380 |
+
if pdf_database:
|
381 |
+
pdf_retriever = pdf_database.as_retriever(search_kwargs={"k": 10})
|
382 |
+
all_relevant_docs.extend(pdf_retriever.get_relevant_documents(message))
|
383 |
|
384 |
+
if office_database:
|
385 |
+
office_retriever = office_database.as_retriever(search_kwargs={"k": 10})
|
386 |
+
all_relevant_docs.extend(office_retriever.get_relevant_documents(message))
|
387 |
+
|
388 |
+
relevant_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
|
389 |
+
|
390 |
+
if not relevant_docs:
|
391 |
+
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
392 |
+
return
|
393 |
+
|
394 |
+
context_str = "\n".join([doc.page_content for doc in relevant_docs])
|
395 |
+
logging.info(f"Total context length: {len(context_str)}")
|
396 |
+
|
397 |
+
for doc in relevant_docs:
|
398 |
+
logging.info(f"Document source: {doc.metadata['source']}")
|
399 |
+
logging.info(f"Document content preview: {doc.page_content[:100]}...") # Log first 100 characters of each document
|
400 |
+
|
401 |
+
if model == "@cf/meta/llama-3.1-8b-instruct":
|
402 |
+
logging.info("Using Cloudflare API")
|
403 |
+
# Use Cloudflare API with the retrieved context
|
404 |
+
for response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="document"):
|
405 |
+
yield response
|
406 |
else:
|
407 |
+
logging.info("Using Hugging Face API")
|
408 |
# Use Hugging Face API
|
409 |
+
messages = [
|
410 |
+
{"role": "system", "content": "You are a highly specialized assistant with expertise in analyzing and summarizing various types of documents including PDFs, Word documents, and Excel spreadsheets. Your goal is to provide accurate, detailed, and precise summaries based on the context provided. Avoid making assumptions or adding information that is not explicitly supported by the context from the documents."},
|
411 |
+
{"role": "user", "content": f"Using the following context from the uploaded documents:\n{context_str}\n\nPlease generate a step-by-step reasoning before arriving at a comprehensive and accurate summary addressing the following question: '{message}'. Ensure your response is strictly based on the provided context, highlighting key metrics, trends, and significant details relevant to the query. Avoid any speculative or unverified information."}
|
412 |
+
]
|
413 |
+
|
414 |
+
client = InferenceClient(model, token=huggingface_token)
|
415 |
+
|
416 |
+
response = ""
|
417 |
+
for i in range(num_calls):
|
418 |
+
logging.info(f"API call {i+1}/{num_calls}")
|
419 |
+
for message in client.chat_completion(
|
420 |
+
messages=messages,
|
421 |
+
max_tokens=20000,
|
422 |
+
temperature=temperature,
|
423 |
+
stream=True,
|
424 |
+
top_p=0.8,
|
425 |
+
):
|
426 |
+
if message.choices and message.choices[0].delta and message.choices[0].delta.content:
|
427 |
+
chunk = message.choices[0].delta.content
|
428 |
+
response += chunk
|
429 |
+
yield response # Yield partial response
|
430 |
+
|
431 |
+
logging.info("Finished generating response")
|
432 |
+
|
433 |
except Exception as e:
|
434 |
logging.error(f"Error with {model}: {str(e)}")
|
435 |
if "microsoft/Phi-3-mini-4k-instruct" in model:
|
436 |
logging.info("Falling back to Mistral model due to Phi-3 error")
|
437 |
fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
|
438 |
+
yield from respond(message, history, fallback_model, temperature, num_calls, use_web_search, selected_docs)
|
439 |
else:
|
440 |
yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
|
441 |
|
|
|
666 |
with demo:
|
667 |
gr.Markdown("## Upload and Manage PDF Documents")
|
668 |
with gr.Row():
|
669 |
+
file_input = gr.Files(label="Upload your documents", file_types=[".pdf", ".docx", ".xlsx", ".xls"])
|
670 |
parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
|
671 |
update_button = gr.Button("Upload Document")
|
672 |
refresh_button = gr.Button("Refresh Document List")
|