juancho72h committed
Commit ba1509a
1 Parent(s): b837587

Upload app.py

Files changed (1):
  1. app.py +117 -94

app.py CHANGED
@@ -2,112 +2,124 @@ import os
 import pinecone
 import openai
 import gradio as gr
-import torch
 from dotenv import load_dotenv
-from pinecone import Pinecone
-from langchain_community.embeddings import HuggingFaceEmbeddings  # Updated import
-from rapidfuzz import fuzz  # Replaced fuzzywuzzy with rapidfuzz
-import logging
-import re  # To help with preprocessing
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-# Detect GPU availability and set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Running on device: {device}")
-
-# Suppress specific warning about clean_up_tokenization_spaces
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning, message="clean_up_tokenization_spaces was not set")
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.docstore.document import Document
+import boto3
 
 # Load environment variables
 load_dotenv()
 
-# Access Pinecone and OpenAI API keys from environment variables
-pinecone_api_key = os.getenv("PINECONE_API_KEY")
+# Access secrets from environment variables
 openai.api_key = os.getenv("OPENAI_API_KEY")
+pinecone_api_key = os.getenv("PINECONE_API_KEY")
+aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
+aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
+bucket_name = 'amtrak-superliner-ai-poc'
+txt_file_name = 'combined_extracted_text.txt'
 index_name = "amtrak-acela-ai-demo"
 
-# Initialize Pinecone using a class-based method
-pc = Pinecone(api_key=pinecone_api_key)
+# Initialize Pinecone using the new class-based method
+pc = pinecone.Pinecone(api_key=pinecone_api_key)
+
+# Initialize AWS S3 client
+s3_client = boto3.client(
+    's3',
+    aws_access_key_id=aws_access_key,
+    aws_secret_access_key=aws_secret_key,
+    region_name='us-east-1'
+)
 
-# Check if the index exists, if not, create it
-def initialize_pinecone_index(index_name):
+# Initialize Pinecone index (check if it exists, otherwise create it)
+def initialize_pinecone_index(index_name, embedding_dim):
     available_indexes = pc.list_indexes().names()
     if index_name not in available_indexes:
-        print(f"Index '{index_name}' does not exist.")
-        # Create the index here if necessary for ZeroGPU usage
+        pc.create_index(
+            name=index_name,
+            dimension=embedding_dim,
+            metric="cosine",
+            spec=pinecone.ServerlessSpec(
+                cloud="aws",
+                region="us-east-1"
+            )
+        )
     return pc.Index(index_name)
 
-index = initialize_pinecone_index(index_name)
+embedding_dim = 768
+index = initialize_pinecone_index(index_name, embedding_dim)
 
 # Initialize HuggingFace embedding model
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-distilbert-base-v4")
 
-# Initialize chat history manually
-chat_history = []
-
-# Helper function to preprocess text (removing unnecessary words)
-def preprocess_text(text):
-    # Convert text to lowercase and remove special characters
-    text = re.sub(r'[^\w\s]', '', text.lower())
-    return text.strip()
-
-# Helper function to recursively flatten any list to a string
-def flatten_to_string(data):
-    if isinstance(data, list):
-        return " ".join([flatten_to_string(item) for item in data])
-    if data is None:
-        return ""
-    return str(data)
-
-# Function to interact with Pinecone and OpenAI GPT-4
-def get_model_response(human_input):
+# Download and load text from S3
+def download_text_from_s3(s3_client, bucket_name, file_name):
+    local_txt_path = os.path.join(os.getcwd(), file_name)
+    s3_client.download_file(bucket_name, file_name, local_txt_path)
+    with open(local_txt_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+doc_text = download_text_from_s3(s3_client, bucket_name, txt_file_name)
+
+# Split and embed the document text
+def process_text_into_embeddings(doc_text):
+    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=3000, chunk_overlap=500)
+    docs = text_splitter.split_documents([Document(page_content=doc_text)])
+    doc_embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])
+    return docs, doc_embeddings
+
+# Check if embeddings already exist in Pinecone
+def check_embeddings_in_pinecone(index):
+    try:
+        stats = index.describe_index_stats()
+        return stats['total_vector_count'] > 0
+    except Exception as e:
+        print(f"Error checking Pinecone index: {e}")
+        return False
+
+# Only process embeddings if they don't already exist in Pinecone
+if not check_embeddings_in_pinecone(index):
+    split_docs, doc_embeddings = process_text_into_embeddings(doc_text)
+    for i, doc in enumerate(split_docs):
+        metadata = {'content': doc.page_content}
+        index.upsert(vectors=[(str(i), doc_embeddings[i], metadata)])
+else:
+    print("Embeddings already exist in Pinecone. Skipping embedding process.")
+
+# Query Pinecone and OpenAI GPT-4 to generate a response
+def get_model_response(human_input, chat_history=None):
     try:
-        # Preprocess the human input (cleaning up unnecessary words)
-        processed_input = preprocess_text(human_input)
+        # Embed the query using the embedding model
+        query_embedding = embedding_model.embed_query(human_input)
 
-        # Embed the query
-        query_embedding = torch.tensor(embedding_model.embed_query(human_input)).to(device)
-        query_embedding = query_embedding.cpu().numpy().tolist()
-
-        # Query Pinecone index with top_k=5 to get more potential matches
-        search_results = index.query(vector=query_embedding, top_k=5, include_metadata=True)
+        # Query Pinecone index to retrieve relevant content
+        search_results = index.query(vector=query_embedding, top_k=3, include_metadata=True)
 
-        context_list, images = [], []
+        # Prepare content and image data
+        context_list = []
+        images = []
+
+        # Extract the content from Pinecone's search results
         for ind, result in enumerate(search_results['matches']):
-            document_content = flatten_to_string(result.get('metadata', {}).get('content', 'No content found'))
-            image_url = flatten_to_string(result.get('metadata', {}).get('image_path', None))
-            figure_desc = flatten_to_string(result.get('metadata', {}).get('figure_description', ''))
+            document_content = result.get('metadata', {}).get('content', 'No content found')
+            image_url = result.get('metadata', {}).get('image_path', None)
+            figure_desc = result.get('metadata', {}).get('figure_description', '')
 
-            # Preprocess the figure description and match keywords
-            processed_figure_desc = preprocess_text(figure_desc)
-            similarity_score = fuzz.token_set_ratio(processed_input, processed_figure_desc)
-            logging.info(f"Matching '{processed_input}' with '{processed_figure_desc}', similarity score: {similarity_score}")
+            context_list.append(f"Document {ind+1}: {document_content}")
 
-            if similarity_score >= 80:  # Keep the threshold at 80 for now
-                context_list.append(f"Relevant information: {document_content}")
-                if image_url and figure_desc:
-                    images.append((figure_desc, image_url))
+            if image_url and figure_desc:  # Only append images that exist and have description
+                images.append((figure_desc, image_url))
 
+        # Combine context from the search results
         context_string = '\n\n'.join(context_list)
 
-        # Add user message to chat history
-        chat_history.append({"role": "user", "content": human_input})
-
-        # Create messages for OpenAI's API
-        messages = [{"role": "system", "content": "You are a helpful assistant."}] + chat_history + [
-            {"role": "system", "content": f"Here is some context:\n{context_string}"},
-            {"role": "user", "content": human_input}
+        # Build messages list for OpenAI
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},  # System prompt
+            {"role": "user", "content": f"Here is some context:\n{context_string}\n\nUser's question: {human_input}"}
         ]
 
-        # Validate messages before sending to OpenAI
-        for message in messages:
-            if not isinstance(message, dict) or "role" not in message or "content" not in message:
-                raise ValueError(f"Invalid message format: {message}")
-
-        # Send the conversation to OpenAI's API
+        # Send the conversation to OpenAI's API, using GPT-3.5 instead of GPT-4
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
             messages=messages,
@@ -115,32 +127,43 @@ def get_model_response(human_input):
             temperature=0.5
         )
 
+        # Get the model's response
         output_text = response['choices'][0]['message']['content'].strip()
 
-        # Add assistant message to chat history
-        chat_history.append({"role": "assistant", "content": output_text})
-
+        # Return both the output and any images found
        return output_text, images
 
     except Exception as e:
         return f"Error invoking model: {str(e)}", []
+
+# Function to format text and images for display
+def get_model_response_with_history(human_input, chat_history=None):
+    if chat_history is None:
+        chat_history = []
+
+    output_text, chat_history = get_model_response(human_input, chat_history)
+
+    # Handle image display
+    def process_image(image_data):
+        if isinstance(image_data, list):
+            # If a list is passed, flatten it to a string
+            return " ".join(str(item) for item in image_data)
+        return str(image_data)
+
+    if chat_history:
+        # Ensure that any file/image alt_text is handled correctly
+        for message in chat_history:
+            if "alt_text" in message:
+                message["alt_text"] = process_image(message.get("alt_text", ""))
 
-# Function to format text and images for display and track conversation
-def get_model_response_with_images(human_input, history=None):
-    output_text, images = get_model_response(human_input)
-    if images:
-        # Append images in Markdown format for Gradio to render
-        image_output = "".join([f"\n\n**{figure_desc}**\n![{figure_desc}]({image_path})" for figure_desc, image_path in images])
-        return output_text + image_output
     return output_text
 
-# Set up Gradio interface
+# Set up Gradio interface without share=True to avoid the error for now
 gr_interface = gr.ChatInterface(
-    fn=get_model_response_with_images,
-    title="Maintenance Assistant",
-    description="Ask questions related to the RMMM documents."
+    fn=get_model_response_with_history,
+    title="Maintenance Assistant",
+    description="Ask questions related to the RMM documents."
 )
 
-# Ensure ZeroGPU or Hugging Face Spaces handles launching properly
-if __name__ == "__main__":
-    gr_interface.launch()
+# Launch the Gradio interface
+gr_interface.launch()
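
A note on configuration: the new version reads all of its secrets with os.getenv() after load_dotenv(), so a local run needs a .env file (or equivalent Space secrets). The key names below come from the code in this commit; the values are placeholders:

OPENAI_API_KEY=<your OpenAI key>
PINECONE_API_KEY=<your Pinecone key>
AWS_ACCESS_KEY_ID=<your AWS access key ID>
AWS_SECRET_ACCESS_KEY=<your AWS secret access key>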
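
The index is created with embedding_dim = 768, which matches the 768-dimensional vectors produced by sentence-transformers/msmarco-distilbert-base-v4. A minimal sanity check along these lines (a sketch, not part of the committed file) would catch a model/index mismatch before any upsert:

# Sketch: confirm the embedding width matches the Pinecone index dimension
probe = embedding_model.embed_query("dimension probe")
assert len(probe) == embedding_dim, (
    f"model emits {len(probe)}-d vectors but the index expects {embedding_dim}"
)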
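
Note also that openai.ChatCompletion.create is the legacy pre-1.0 interface, so this code assumes openai<1.0 is pinned. If the Space were ever migrated to the 1.x client, the equivalent call would look roughly like the sketch below (an assumption about a future migration, showing only the parameters visible in this diff):

# Sketch: openai>=1.0 equivalent of the ChatCompletion call above
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    temperature=0.5,
)
output_text = response.choices[0].message.content.strip()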