Shreyas094 committed
Commit 6b3b427
1 Parent(s): 3449685

Update app.py

Files changed (1)
  1. app.py +47 -66
app.py CHANGED
@@ -67,7 +67,7 @@ def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[
         raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
 
 def get_embeddings():
-    return HuggingFaceEmbeddings(model_name="avsolatorio/GIST-Embedding-v0")
+    return HuggingFaceEmbeddings(model_name="sentence-transformers/stsb-roberta-large")
 
 # Add this at the beginning of your script, after imports
 DOCUMENTS_FILE = "uploaded_documents.json"
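
The new embedding model is a drop-in swap inside LangChain's HuggingFaceEmbeddings wrapper. A minimal sketch of what the changed function now returns, assuming the app imports the wrapper from langchain_community.embeddings (the import itself is outside this hunk):

# Sketch only; the import path and usage are assumptions based on the LangChain wrapper.
from langchain_community.embeddings import HuggingFaceEmbeddings

embed = HuggingFaceEmbeddings(model_name="sentence-transformers/stsb-roberta-large")
vector = embed.embed_query("net interest income and key metrics")
print(len(vector))  # stsb-roberta-large produces 1024-dimensional embeddings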
@@ -271,10 +271,33 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
     print(f"Final clean response: {final_response[:100]}...")
     return final_response
 
-def duckduckgo_search(query):
-    with DDGS() as ddgs:
-        results = ddgs.text(query, max_results=10)
-    return results
+class SimpleDDGSearch:
+    def search(self, query: str, num_results: int = 5):
+        results = DDGS().text(query, region='wt-wt', safesearch='off', max_results=num_results)
+        return [res["href"] for res in results]
+
+class TrafilaturaWebCrawler:
+    def get_website_content_from_url(self, url: str) -> str:
+        try:
+            downloaded = fetch_url(url)
+            if downloaded is None:
+                return f"Failed to fetch content from URL: {url}"
+
+            result = extract(downloaded, output_format='json', include_comments=False, with_metadata=True, url=url)
+
+            if result:
+                result_dict = json.loads(result)
+                title = result_dict.get('title', 'No title found')
+                content = result_dict.get('text', 'No content extracted')
+
+                if content == 'No content extracted':
+                    content = extract(downloaded, include_comments=False)
+
+                return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{content}\n\n=========== Website Content End ===========\n\n'
+            else:
+                return f"No content extracted from URL: {url}"
+        except Exception as e:
+            return f"An error occurred while processing {url}: {str(e)}"
 
 class CitingSources(BaseModel):
     sources: List[str] = Field(
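
This hunk swaps the old duckduckgo_search helper for two classes: SimpleDDGSearch returns result URLs and TrafilaturaWebCrawler fetches each page and extracts its text. A minimal standalone sketch of the same search-then-crawl flow against the underlying libraries; the query string is only an example, and duckduckgo_search and trafilatura are assumed to be installed:

# Standalone sketch of the new search-then-crawl flow; not the app's code verbatim.
import json

from duckduckgo_search import DDGS
from trafilatura import extract, fetch_url

query = "example financial news query"  # placeholder query
urls = [r["href"] for r in DDGS().text(query, region="wt-wt", safesearch="off", max_results=3)]

context = ""
for url in urls:
    downloaded = fetch_url(url)
    if downloaded is None:
        continue  # skip pages that could not be fetched
    extracted = extract(downloaded, output_format="json", include_comments=False, with_metadata=True, url=url)
    if extracted:
        data = json.loads(extracted)
        context += f"{data.get('title', '')}\n{data.get('text', '')}\n\n"

print(context[:500])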
@@ -376,7 +399,7 @@ def get_context_for_summary(selected_docs):
     embed = get_embeddings()
     if os.path.exists("faiss_database"):
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-        retriever = database.as_retriever(search_kwargs={"k": 10})  # Retrieve top 5 most relevant chunks
+        retriever = database.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 most relevant chunks
 
         # Create a generic query that covers common financial summary topics
         generic_query = "financial performance revenue profit assets liabilities cash flow key metrics highlights"
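
With this change the summary retriever matches its comment and pulls five chunks instead of ten. A small sketch of the retrieval step in isolation, assuming LangChain's FAISS vector store from langchain_community plus the app's own get_embeddings() helper and on-disk faiss_database index:

# Sketch of the k=5 retrieval; the import path and index name are assumptions.
from langchain_community.vectorstores import FAISS

embed = get_embeddings()  # the app's helper defined earlier in app.py
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
retriever = database.as_retriever(search_kwargs={"k": 5})
chunks = retriever.get_relevant_documents("financial performance revenue profit key metrics")
print(len(chunks))  # at most 5 chunks are returned for the summary context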
@@ -409,36 +432,6 @@ def get_context_for_query(query, selected_docs):
     else:
         return "No documents available to answer the query."
 
-def validate_response(initial_response, context, query, model, temperature=0.1):
-    validation_prompt = f"""Given the following context and initial response to the query "{query}":
-
-Context:
-{context}
-
-Initial Response:
-{initial_response}
-
-You are an expert assistant tasked with carefully validating the initial response against the provided context. Remove any hallucinations, irrelevant details, or factually incorrect information. Generate a revised response that is accurate and directly supported by the context. If any information cannot be verified from the context, explicitly state that it could not be confirmed. After writing the revised response, provide a list of all sources used.
-
-Revised Response:
-"""
-
-    if model == "@cf/meta/llama-3.1-8b-instruct":
-        return get_response_from_cloudflare(prompt=validation_prompt, context="", query="", num_calls=1, temperature=temperature, search_type="validation")
-    else:
-        client = InferenceClient(model, token=huggingface_token)
-        revised_response = ""
-        for message in client.chat_completion(
-            messages=[{"role": "user", "content": validation_prompt}],
-            max_tokens=10000,
-            temperature=temperature,
-            stream=True,
-        ):
-            if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                chunk = message.choices[0].delta.content
-                revised_response += chunk
-                yield revised_response
-
 def get_response_from_cloudflare(prompt, context, query, num_calls=3, temperature=0.2, search_type="pdf"):
     headers = {
         "Authorization": f"Bearer {API_TOKEN}",
@@ -450,19 +443,15 @@ def get_response_from_cloudflare(prompt, context, query, num_calls=3, temperatur
         instruction = f"""Using the following context from the PDF documents:
 {context}
 Write a detailed and complete response that answers the following user question: '{query}'"""
-    elif search_type == "web":
+    else:  # web search
         instruction = f"""Using the following context:
 {context}
 Write a detailed and complete research document that fulfills the following user request: '{query}'
 After writing the document, please provide a list of sources used in your response."""
-    elif search_type == "validation":
-        instruction = prompt  # For validation, use the provided prompt directly
-    else:
-        raise ValueError("Invalid search_type")
 
     inputs = [
         {"role": "system", "content": instruction},
-        {"role": "user", "content": query if search_type != "validation" else ""}
+        {"role": "user", "content": query}
     ]
 
     payload = {
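
With the "validation" branch gone, search_type now only distinguishes PDF answers from web research, and the user turn always carries the query. For reference, a hedged sketch of the kind of streaming request the surrounding function appears to send to Cloudflare Workers AI; the account-scoped /ai/run endpoint, the ACCOUNT_ID placeholder, and the exact payload fields are assumptions, since they sit outside this hunk:

# Hedged sketch; ACCOUNT_ID and API_TOKEN are placeholders, and the payload mirrors
# the messages/stream fields suggested by the diff, not the file's exact code.
import json
import requests

ACCOUNT_ID = "your-cloudflare-account-id"  # placeholder
API_TOKEN = "your-cloudflare-api-token"    # placeholder

url = f"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/meta/llama-3.1-8b-instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}", "Content-Type": "application/json"}
payload = {
    "messages": [
        {"role": "system", "content": "Using the following context: ..."},
        {"role": "user", "content": "Summarize the filing."},
    ],
    "stream": True,
    "temperature": 0.2,
}

with requests.post(url, headers=headers, json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if line and line.startswith(b"data: ") and line != b"data: [DONE]":
            chunk = json.loads(line[len(b"data: "):])
            print(chunk.get("response", ""), end="", flush=True)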
@@ -509,35 +498,30 @@ def create_web_search_vectors(search_results):
 
     return FAISS.from_documents(documents, embed)
 
-def get_response_with_search(query, model, num_calls=3, temperature=0.1):
-    search_results = duckduckgo_search(query)
-    web_search_database = create_web_search_vectors(search_results)
+def get_response_with_search(query, model, num_calls=3, temperature=0.2):
+    searcher = SimpleDDGSearch()
+    search_results = searcher.search(query, num_results=5)
 
-    if not web_search_database:
-        yield "No web search results available. Please try again.", ""
-        return
-
-    retriever = web_search_database.as_retriever(search_kwargs={"k": 10})
-    relevant_docs = retriever.get_relevant_documents(query)
+    crawler = TrafilaturaWebCrawler()
+    context = ""
 
-    context = "\n".join([doc.page_content for doc in relevant_docs])
+    for url in search_results:
+        context += crawler.get_website_content_from_url(url) + "\n"
 
     prompt = f"""Using the following context from web search results:
 {context}
-You are an expert assistant tasked with creating a detailed and comprehensive research document in response to the following user query: '{query}'
-Base your entire response strictly on the information retrieved from trusted sources. After completing the document, provide a list of all sources used.
-Importantly, only include information that is directly supported by the retrieved content.
-If any part of the information cannot be verified from the given sources, clearly state that it could not be confirmed."""
+Write a detailed and complete research document that fulfills the following user request: '{query}'
+After writing the document, please provide a list of sources used in your response."""
 
-    initial_response = ""
     if model == "@cf/meta/llama-3.1-8b-instruct":
         # Use Cloudflare API
         for response in get_response_from_cloudflare(prompt="", context=context, query=query, num_calls=num_calls, temperature=temperature, search_type="web"):
-            initial_response = response
+            yield response, ""  # Yield streaming response without sources
     else:
         # Use Hugging Face API
         client = InferenceClient(model, token=huggingface_token)
 
+        main_content = ""
         for i in range(num_calls):
             for message in client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
@@ -547,17 +531,14 @@ If any part of the information cannot be verified from the given sources, clearl
             ):
                 if message.choices and message.choices[0].delta and message.choices[0].delta.content:
                     chunk = message.choices[0].delta.content
-                    initial_response += chunk
-
-    # Validation step
-    for revised_response in validate_response(initial_response, context, query, model, temperature):
-        yield revised_response, ""  # Yield streaming revised response without sources
+                    main_content += chunk
+                    yield main_content, ""  # Yield partial main content without sources
 
 
 INSTRUCTION_PROMPTS = {
-    "Asset Managers": "Focus on the Management Discussion and Analysis and Financial Statements sections. Summarize key financial metrics, assets under management, and performance highlights for this asset management company.",
-    "Consumer Finance Companies": "Extract relevant data primarily from the Management Discussion and Analysis and Financial Statements. Provide a summary of the company's loan portfolio, interest income, credit quality, and key operational metrics.",
-    "Mortgage REITs": "Concentrate on the Financial Statements and Management Discussion and Analysis. Summarize the REIT's mortgage-backed securities portfolio, net interest income, book value per share, and dividend yield.",
+    "Asset Managers": "Summarize the key financial metrics, assets under management, and performance highlights for this asset management company.",
+    "Consumer Finance Companies": "Provide a summary of the company's loan portfolio, interest income, credit quality, and key operational metrics.",
+    "Mortgage REITs": "Summarize the REIT's mortgage-backed securities portfolio, net interest income, book value per share, and dividend yield.",
     # Add more instruction prompts as needed
 }
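
Downstream, get_response_with_search is a generator that streams (partial_text, sources) tuples, with the sources element left empty in this commit. A hypothetical consumer loop for the streaming output; the Hugging Face model id below is a placeholder, not one read from the app:

# Hypothetical consumer of the generator above, e.g. inside a Gradio event handler.
for partial_answer, _sources in get_response_with_search(
    "Summarize this week's central bank announcements",
    model="mistralai/Mistral-7B-Instruct-v0.3",  # placeholder model id
    num_calls=1,
    temperature=0.2,
):
    print(partial_answer[-80:], end="\r")  # display the tail of the growing answer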