Sentinel-AI-Beta-Test

Running

App Files Files Community

Shreyas094 committed on Jul 26

Commit

a6abb8f

•

1 Parent(s): 1f50701

Update app.py

Browse files

Files changed (1) hide show

app.py +222 -69

app.py CHANGED Viewed

@@ -14,15 +14,26 @@ from llama_parse import LlamaParse
 from langchain_core.documents import Document
 from huggingface_hub import InferenceClient
 import inspect
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "microsoft/Phi-3-mini-4k-instruct"
 ]
 # Initialize LlamaParse
@@ -79,31 +90,71 @@ def update_vectors(files, parser):
 def generate_chunked_response(prompt, model, max_tokens=1000, num_calls=3, temperature=0.2, should_stop=False):
     print(f"Starting generate_chunked_response with {num_calls} calls")
-    client = InferenceClient(model, token=huggingface_token)
     full_response = ""
     messages = [{"role": "user", "content": prompt}]
-    for i in range(num_calls):
-        print(f"Starting API call {i+1}")
-        if should_stop:
-            print("Stop clicked, breaking loop")
-            break
-        try:
-            for message in client.chat_completion(
-                messages=messages,
-                max_tokens=max_tokens,
-                temperature=temperature,
-                stream=True,
-            ):
-                if should_stop:
-                    print("Stop clicked during streaming, breaking")
-                    break
-                if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                    chunk = message.choices[0].delta.content
-                    full_response += chunk
-            print(f"API call {i+1} completed")
-        except Exception as e:
-            print(f"Error in generating response: {str(e)}")
     # Clean up the response
     clean_response = re.sub(r'<s>\[INST\].*?\[/INST\]\s*', '', full_response, flags=re.DOTALL)
@@ -144,16 +195,15 @@ def chatbot_interface(message, history, use_web_search, model, temperature, num_
     history = history + [(message, "")]
     try:
-        if use_web_search:
-            for main_content, sources in get_response_with_search(message, model, num_calls=num_calls, temperature=temperature):
-                history[-1] = (message, f"{main_content}\n\n{sources}")
-                yield history
-        else:
-            for partial_response in get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature):
-                history[-1] = (message, partial_response)
-                yield history
     except gr.CancelledError:
         yield history
 def retry_last_response(history, use_web_search, model, temperature, num_calls):
     if not history:
@@ -165,12 +215,103 @@ def retry_last_response(history, use_web_search, model, temperature, num_calls):
     return chatbot_interface(last_user_msg, history, use_web_search, model, temperature, num_calls)
 def respond(message, history, model, temperature, num_calls, use_web_search):
-    if use_web_search:
-        for main_content, sources in get_response_with_search(message, model, num_calls=num_calls, temperature=temperature):
-            yield f"{main_content}\n\n{sources}"
-    else:
-        for partial_response in get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature):
-            yield partial_response
 def get_response_with_search(query, model, num_calls=3, temperature=0.2):
     search_results = duckduckgo_search(query)
@@ -181,21 +322,27 @@ def get_response_with_search(query, model, num_calls=3, temperature=0.2):
 {context}
 Write a detailed and complete research document that fulfills the following user request: '{query}'
 After writing the document, please provide a list of sources used in your response."""
-    client = InferenceClient(model, token=huggingface_token)
-    main_content = ""
-    for i in range(num_calls):
-        for message in client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=1000,
-            temperature=temperature,
-            stream=True,
-        ):
-            if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                chunk = message.choices[0].delta.content
-                main_content += chunk
-                yield main_content, ""  # Yield partial main content without sources
 def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
     embed = get_embeddings()
@@ -209,24 +356,30 @@ def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
     relevant_docs = retriever.get_relevant_documents(query)
     context_str = "\n".join([doc.page_content for doc in relevant_docs])
-    prompt = f"""Using the following context from the PDF documents:
 {context_str}
 Write a detailed and complete response that answers the following user question: '{query}'"""
-    client = InferenceClient(model, token=huggingface_token)
-    response = ""
-    for i in range(num_calls):
-        for message in client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=1000,
-            temperature=temperature,
-            stream=True,
-        ):
-            if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                chunk = message.choices[0].delta.content
-                response += chunk
-                yield response  # Yield partial response
 def vote(data: gr.LikeData):
     if data.liked:
@@ -299,7 +452,7 @@ with demo:
     1. Upload PDF documents using the file input at the top.
     2. Select the PDF parser (pypdf or llamaparse) and click "Upload Document" to update the vector store.
     3. Ask questions in the chat interface.
-    4. Toggle "Use Web Search" to switch between PDF chat and web search, the toggle box is present inside additional inputs dropdown.
     5. Adjust Temperature and Number of API Calls to fine-tune the response generation.
     6. Use the provided examples or ask your own questions.
     """

 from langchain_core.documents import Document
 from huggingface_hub import InferenceClient
 import inspect
+import logging
+# Set up basic configuration for logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # Environment variables and configurations
 # Credentials are read from the process environment; each is None when unset.
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
+# NOTE(review): this variable name is spelled "CLOUDFARE" while the token below
+# uses "CLOUDFLARE" - confirm which spelling the deployment actually defines.
+ACCOUNT_ID = os.environ.get("CLOUDFARE_ACCOUNT_ID")
+API_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")
+# NOTE(review): the account id is a literal here, whereas generate_chunked_response
+# builds its URL from ACCOUNT_ID; the two can disagree - confirm which is intended.
+API_BASE_URL = "https://api.cloudflare.com/client/v4/accounts/a17f03e0f049ccae0c15cdcf3b9737ce/ai/run/"
+# Startup diagnostics written to stdout at import time.
+print(f"ACCOUNT_ID: {ACCOUNT_ID}")
+# NOTE(review): prints the first five characters of the auth token when it is set;
+# consider reporting only whether it is set.
+print(f"CLOUDFLARE_AUTH_TOKEN: {API_TOKEN[:5]}..." if API_TOKEN else "Not set")
 # Model identifiers compared against in the response functions; the "@cf/..." id
 # is the one routed to the Cloudflare endpoint.
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "@cf/meta/llama-3.1-8b-instruct"
 ]
 # Initialize LlamaParse
 def generate_chunked_response(prompt, model, max_tokens=1000, num_calls=3, temperature=0.2, should_stop=False):
     print(f"Starting generate_chunked_response with {num_calls} calls")
     full_response = ""
     messages = [{"role": "user", "content": prompt}]
+    if model == "@cf/meta/llama-3.1-8b-instruct":
+        # Cloudflare API
+        for i in range(num_calls):
+            print(f"Starting Cloudflare API call {i+1}")
+            if should_stop:
+                print("Stop clicked, breaking loop")
+                break
+            try:
+                response = requests.post(
+                    f"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/meta/llama-3.1-8b-instruct",
+                    headers={"Authorization": f"Bearer {API_TOKEN}"},
+                    json={
+                        "stream": true,
+                        "messages": [
+                            {"role": "system", "content": "You are a friendly assistant"},
+                            {"role": "user", "content": prompt}
+                        ],
+                        "max_tokens": max_tokens,
+                        "temperature": temperature
+                    },
+                    stream=true
+                )
+                for line in response.iter_lines():
+                    if should_stop:
+                        print("Stop clicked during streaming, breaking")
+                        break
+                    if line:
+                        try:
+                            json_data = json.loads(line.decode('utf-8').split('data: ')[1])
+                            chunk = json_data['response']
+                            full_response += chunk
+                        except json.JSONDecodeError:
+                            continue
+                print(f"Cloudflare API call {i+1} completed")
+            except Exception as e:
+                print(f"Error in generating response from Cloudflare: {str(e)}")
+    else:
+        # Original Hugging Face API logic
+        client = InferenceClient(model, token=huggingface_token)
+        for i in range(num_calls):
+            print(f"Starting Hugging Face API call {i+1}")
+            if should_stop:
+                print("Stop clicked, breaking loop")
+                break
+            try:
+                for message in client.chat_completion(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    stream=True,
+                ):
+                    if should_stop:
+                        print("Stop clicked during streaming, breaking")
+                        break
+                    if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                        chunk = message.choices[0].delta.content
+                        full_response += chunk
+                print(f"Hugging Face API call {i+1} completed")
+            except Exception as e:
+                print(f"Error in generating response from Hugging Face: {str(e)}")
     # Clean up the response
     clean_response = re.sub(r'<s>\[INST\].*?\[/INST\]\s*', '', full_response, flags=re.DOTALL)
     history = history + [(message, "")]
     try:
+        for response in respond(message, history, model, temperature, num_calls, use_web_search):
+            history[-1] = (message, response)
+            yield history
     except gr.CancelledError:
         yield history
+    except Exception as e:
+        logging.error(f"Unexpected error in chatbot_interface: {str(e)}")
+        history[-1] = (message, f"An unexpected error occurred: {str(e)}")
+        yield history
 def retry_last_response(history, use_web_search, model, temperature, num_calls):
     if not history:
     return chatbot_interface(last_user_msg, history, use_web_search, model, temperature, num_calls)
 def respond(message, history, model, temperature, num_calls, use_web_search):
+    logging.info(f"User Query: {message}")
+    logging.info(f"Model Used: {model}")
+    logging.info(f"Search Type: {'Web Search' if use_web_search else 'PDF Search'}")
+    try:
+        if use_web_search:
+            for main_content, sources in get_response_with_search(message, model, num_calls=num_calls, temperature=temperature):
+                response = f"{main_content}\n\n{sources}"
+                first_line = response.split('\n')[0] if response else ''
+                logging.info(f"Generated Response (first line): {first_line}")
+                yield response
+        else:
+            if model == "@cf/meta/llama-3.1-8b-instruct":
+                # Use Cloudflare API
+                embed = get_embeddings()
+                if os.path.exists("faiss_database"):
+                    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+                    retriever = database.as_retriever()
+                    relevant_docs = retriever.get_relevant_documents(message)
+                    context_str = "\n".join([doc.page_content for doc in relevant_docs])
+                else:
+                    context_str = "No documents available."
+                for partial_response in get_response_from_cloudflare(prompt="", context=context_str, query=message, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+                    first_line = partial_response.split('\n')[0] if partial_response else ''
+                    logging.info(f"Generated Response (first line): {first_line}")
+                    yield partial_response
+            else:
+                # Use Hugging Face API
+                for partial_response in get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature):
+                    first_line = partial_response.split('\n')[0] if partial_response else ''
+                    logging.info(f"Generated Response (first line): {first_line}")
+                    yield partial_response
+    except Exception as e:
+        logging.error(f"Error with {model}: {str(e)}")
+        if "microsoft/Phi-3-mini-4k-instruct" in model:
+            logging.info("Falling back to Mistral model due to Phi-3 error")
+            fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
+            yield from respond(message, history, fallback_model, temperature, num_calls, use_web_search)
+        else:
+            yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
+logging.basicConfig(level=logging.DEBUG)
+def get_response_from_cloudflare(prompt, context, query, num_calls=3, temperature=0.2, search_type="pdf"):
+    headers = {
+        "Authorization": f"Bearer {API_TOKEN}",
+        "Content-Type": "application/json"
+    }
+    model = "@cf/meta/llama-3.1-8b-instruct"
+    if search_type == "pdf":
+        instruction = f"""Using the following context:
+{context}
+Write a detailed and complete research document that fulfills the following user request: '{query}'"""
+    else:  # web search
+        instruction = f"""Using the following context:
+{context}
+Write a detailed and complete research document that fulfills the following user request: '{query}'
+After writing the document, please provide a list of sources used in your response."""
+    inputs = [
+        {"role": "system", "content": instruction},
+        {"role": "user", "content": query}
+    ]
+    payload = {
+        "messages": inputs,
+        "stream": True,
+        "temperature": temperature
+    }
+    full_response = ""
+    for i in range(num_calls):
+        try:
+            with requests.post(f"{API_BASE_URL}{model}", headers=headers, json=payload, stream=True) as response:
+                if response.status_code == 200:
+                    for line in response.iter_lines():
+                        if line:
+                            try:
+                                json_response = json.loads(line.decode('utf-8').split('data: ')[1])
+                                if 'response' in json_response:
+                                    chunk = json_response['response']
+                                    full_response += chunk
+                                    yield full_response
+                            except (json.JSONDecodeError, IndexError) as e:
+                                logging.error(f"Error parsing streaming response: {str(e)}")
+                                continue
+                else:
+                    logging.error(f"HTTP Error: {response.status_code}, Response: {response.text}")
+                    yield f"I apologize, but I encountered an HTTP error: {response.status_code}. Please try again later."
+        except Exception as e:
+            logging.error(f"Error in generating response from Cloudflare: {str(e)}")
+            yield f"I apologize, but an error occurred: {str(e)}. Please try again later."
+    if not full_response:
+        yield "I apologize, but I couldn't generate a response at this time. Please try again later."
 def get_response_with_search(query, model, num_calls=3, temperature=0.2):
     search_results = duckduckgo_search(query)
 {context}
 Write a detailed and complete research document that fulfills the following user request: '{query}'
 After writing the document, please provide a list of sources used in your response."""
+    if model == "@cf/meta/llama-3.1-8b-instruct":
+        # Use Cloudflare API
+        for response in get_response_from_cloudflare(prompt="", context=context, query=query, num_calls=num_calls, temperature=temperature, search_type="web"):
+            yield response, ""  # Yield streaming response without sources
+    else:
+        # Use Hugging Face API
+        client = InferenceClient(model, token=huggingface_token)
+        main_content = ""
+        for i in range(num_calls):
+            for message in client.chat_completion(
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=1000,
+                temperature=temperature,
+                stream=True,
+            ):
+                if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                    chunk = message.choices[0].delta.content
+                    main_content += chunk
+                    yield main_content, ""  # Yield partial main content without sources
 def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
     embed = get_embeddings()
     relevant_docs = retriever.get_relevant_documents(query)
     context_str = "\n".join([doc.page_content for doc in relevant_docs])
+    if model == "@cf/meta/llama-3.1-8b-instruct":
+        # Use Cloudflare API with the retrieved context
+        for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+            yield response
+    else:
+        # Use Hugging Face API
+        prompt = f"""Using the following context from the PDF documents:
 {context_str}
 Write a detailed and complete response that answers the following user question: '{query}'"""
+        client = InferenceClient(model, token=huggingface_token)
+        response = ""
+        for i in range(num_calls):
+            for message in client.chat_completion(
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=1000,
+                temperature=temperature,
+                stream=True,
+            ):
+                if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                    chunk = message.choices[0].delta.content
+                    response += chunk
+                    yield response  # Yield partial response
 def vote(data: gr.LikeData):
     if data.liked:
     1. Upload PDF documents using the file input at the top.
     2. Select the PDF parser (pypdf or llamaparse) and click "Upload Document" to update the vector store.
     3. Ask questions in the chat interface.
+    4. Toggle "Use Web Search" to switch between PDF chat and web search.
     5. Adjust Temperature and Number of API Calls to fine-tune the response generation.
     6. Use the provided examples or ask your own questions.
     """