Sentinel-AI-Beta-Test

Running

App Files Files Community

Shreyas094 committed on Aug 28

Commit

5676761

•

1 Parent(s): 40983c7

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -47

app.py CHANGED Viewed

@@ -17,6 +17,8 @@ from huggingface_hub import InferenceClient
 import inspect
 import logging
 import shutil
 # Set up basic configuration for logging
@@ -53,6 +55,23 @@ llama_parser = LlamaParse(
     language="en",
 )
 def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
     """Loads and splits the document into pages."""
     if parser == "pypdf":
@@ -93,19 +112,23 @@ uploaded_documents = load_documents()
 def update_vectors(files, parser):
     global uploaded_documents
     logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
     if not files:
         logging.warning("No files provided for update_vectors")
-        return "Please upload at least one PDF file.", display_documents()
     embed = get_embeddings()
     total_chunks = 0
     all_data = []
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
-            data = load_document(file, parser)
             if not data:
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
@@ -119,33 +142,34 @@ def update_vectors(files, parser):
                 logging.info(f"Document already exists in uploaded_documents: {file.name}")
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
     logging.info(f"Total chunks processed: {total_chunks}")
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
     try:
-        if os.path.exists("faiss_database"):
-            logging.info("Updating existing FAISS database")
-            database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-            database.add_documents(all_data)
         else:
-            logging.info("Creating new FAISS database")
-            database = FAISS.from_documents(all_data, embed)
-        database.save_local("faiss_database")
-        logging.info("FAISS database saved")
     except Exception as e:
-        logging.error(f"Error updating FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
     # Save the updated list of documents
     save_documents(uploaded_documents)
     # Return a tuple with the status message and the updated document list
-    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
 def delete_documents(selected_docs):
@@ -336,45 +360,82 @@ def respond(message, history, model, temperature, num_calls, use_web_search, sel
         else:
             yield "Unable to generate a response. Please try a different query."
     else:
-        # Existing PDF search logic
         try:
             embed = get_embeddings()
             if os.path.exists("faiss_database"):
-                database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-                retriever = database.as_retriever(search_kwargs={"k": 20})
-                all_relevant_docs = retriever.get_relevant_documents(message)
-                relevant_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
-                if not relevant_docs:
-                    yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
-                    return
-                context_str = "\n".join([doc.page_content for doc in relevant_docs])
-                logging.info(f"Context length: {len(context_str)}")
-            else:
-                context_str = "No documents available."
-                yield "No documents available. Please upload PDF documents to answer questions."
                 return
-            if model.startswith("duckduckgo/"):
-                # Use DuckDuckGo chat with context
-                for partial_response in get_response_from_duckduckgo(message, model, context_str, num_calls, temperature):
-                    yield partial_response
-            elif model == "@cf/meta/llama-3.1-8b-instruct":
-                # Use Cloudflare API
-                for partial_response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="pdf"):
-                    yield partial_response
             else:
                 # Use Hugging Face API
-                for partial_response in get_response_from_pdf(message, model, selected_docs, num_calls=num_calls, temperature=temperature):
-                    yield partial_response
         except Exception as e:
             logging.error(f"Error with {model}: {str(e)}")
             if "microsoft/Phi-3-mini-4k-instruct" in model:
                 logging.info("Falling back to Mistral model due to Phi-3 error")
                 fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
-                yield from respond(message, history, fallback_model, temperature, num_calls, selected_docs)
             else:
                 yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
@@ -605,7 +666,7 @@ demo = gr.ChatInterface(
 with demo:
     gr.Markdown("## Upload and Manage PDF Documents")
     with gr.Row():
-        file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
         parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
         update_button = gr.Button("Upload Document")
         refresh_button = gr.Button("Refresh Document List")

 import inspect
 import logging
 import shutil
+import pandas as pd
+from docx import Document as DocxDocument
 # Set up basic configuration for logging
     language="en",
 )
+def load_office_document(file: NamedTemporaryFile) -> List[Document]:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    documents = []
+    if file_extension in ['.xlsx', '.xls']:
+        df = pd.read_excel(file.name)
+        for _, row in df.iterrows():
+            content = ' '.join(str(cell) for cell in row if pd.notna(cell))
+            documents.append(Document(page_content=content, metadata={"source": file.name}))
+    elif file_extension == '.docx':
+        doc = Document(file.name)
+        for para in doc.paragraphs:
+            if para.text.strip():
+                documents.append(Document(page_content=para.text, metadata={"source": file.name}))
+    return documents
 def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
     """Loads and splits the document into pages."""
     if parser == "pypdf":
 def update_vectors(files, parser):
     global uploaded_documents
     logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
     if not files:
         logging.warning("No files provided for update_vectors")
+        return "Please upload at least one file.", display_documents()
     embed = get_embeddings()
     total_chunks = 0
     all_data = []
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
+            if file.name.lower().endswith(('.xlsx', '.xls', '.docx')):
+                data = load_office_document(file)
+            else:
+                data = load_document(file, parser)
             if not data:
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
                 logging.info(f"Document already exists in uploaded_documents: {file.name}")
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
     logging.info(f"Total chunks processed: {total_chunks}")
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
     try:
+        # Create or update the office documents vector store
+        if os.path.exists("office_faiss_database"):
+            logging.info("Updating existing office FAISS database")
+            office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
+            office_database.add_documents(all_data)
         else:
+            logging.info("Creating new office FAISS database")
+            office_database = FAISS.from_documents(all_data, embed)
+        office_database.save_local("office_faiss_database")
+        logging.info("Office FAISS database saved")
     except Exception as e:
+        logging.error(f"Error updating office FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
     # Save the updated list of documents
     save_documents(uploaded_documents)
     # Return a tuple with the status message and the updated document list
+    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files.", display_documents()
 def delete_documents(selected_docs):
         else:
             yield "Unable to generate a response. Please try a different query."
     else:
+        # PDF and Office documents search logic
         try:
             embed = get_embeddings()
+            pdf_database = None
+            office_database = None
             if os.path.exists("faiss_database"):
+                pdf_database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+            if os.path.exists("office_faiss_database"):
+                office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
+            if not pdf_database and not office_database:
+                yield "No documents available. Please upload documents to answer questions."
                 return
+            all_relevant_docs = []
+            if pdf_database:
+                pdf_retriever = pdf_database.as_retriever(search_kwargs={"k": 10})
+                all_relevant_docs.extend(pdf_retriever.get_relevant_documents(message))
+            if office_database:
+                office_retriever = office_database.as_retriever(search_kwargs={"k": 10})
+                all_relevant_docs.extend(office_retriever.get_relevant_documents(message))
+            relevant_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
+            if not relevant_docs:
+                yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
+                return
+            context_str = "\n".join([doc.page_content for doc in relevant_docs])
+            logging.info(f"Total context length: {len(context_str)}")
+            for doc in relevant_docs:
+                logging.info(f"Document source: {doc.metadata['source']}")
+                logging.info(f"Document content preview: {doc.page_content[:100]}...")  # Log first 100 characters of each document
+            if model == "@cf/meta/llama-3.1-8b-instruct":
+                logging.info("Using Cloudflare API")
+                # Use Cloudflare API with the retrieved context
+                for response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="document"):
+                    yield response
             else:
+                logging.info("Using Hugging Face API")
                 # Use Hugging Face API
+                messages = [
+                    {"role": "system", "content": "You are a highly specialized assistant with expertise in analyzing and summarizing various types of documents including PDFs, Word documents, and Excel spreadsheets. Your goal is to provide accurate, detailed, and precise summaries based on the context provided. Avoid making assumptions or adding information that is not explicitly supported by the context from the documents."},
+                    {"role": "user", "content": f"Using the following context from the uploaded documents:\n{context_str}\n\nPlease generate a step-by-step reasoning before arriving at a comprehensive and accurate summary addressing the following question: '{message}'. Ensure your response is strictly based on the provided context, highlighting key metrics, trends, and significant details relevant to the query. Avoid any speculative or unverified information."}
+                ]
+                client = InferenceClient(model, token=huggingface_token)
+                response = ""
+                for i in range(num_calls):
+                    logging.info(f"API call {i+1}/{num_calls}")
+                    for message in client.chat_completion(
+                        messages=messages,
+                        max_tokens=20000,
+                        temperature=temperature,
+                        stream=True,
+                        top_p=0.8,
+                    ):
+                        if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                            chunk = message.choices[0].delta.content
+                            response += chunk
+                            yield response  # Yield partial response
+                logging.info("Finished generating response")
         except Exception as e:
             logging.error(f"Error with {model}: {str(e)}")
             if "microsoft/Phi-3-mini-4k-instruct" in model:
                 logging.info("Falling back to Mistral model due to Phi-3 error")
                 fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
+                yield from respond(message, history, fallback_model, temperature, num_calls, use_web_search, selected_docs)
             else:
                 yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
 with demo:
     gr.Markdown("## Upload and Manage PDF Documents")
     with gr.Row():
+        file_input = gr.Files(label="Upload your documents", file_types=[".pdf", ".docx", ".xlsx", ".xls"])
         parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
         update_button = gr.Button("Upload Document")
         refresh_button = gr.Button("Refresh Document List")