# Page header captured with this source -- Spaces status: Runtime error
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
import streamlit as st | |
# Load CSV data using a relative path; the file must sit in the same
# directory as app.py.
csv_file = "Hydra-Movie-Scrape.csv"
df = pd.read_csv(csv_file)

# 'Summary' is the text source for the document embeddings (the original note
# mentions 'Short Summary' as an alternative column). Missing summaries are
# replaced with a placeholder so every row yields a string to encode.
df['Summary'] = df['Summary'].fillna("No summary available.")
documents = df['Summary'].tolist()


@st.cache_resource
def _load_model():
    """Return the shared SentenceTransformer instance.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the model as a resource means it is constructed once and reused
    across reruns instead of being rebuilt each time.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Initialize the SentenceTransformer model (served from the resource cache
# after the first run).
model = _load_model()
# Create and cache embeddings
@st.cache_data
def create_embeddings(documents):
    """Encode ``documents`` with the module-level sentence-transformer model.

    The function is wrapped in ``st.cache_data`` so that Streamlit script
    reruns with an unchanged document list reuse the stored result instead of
    re-encoding the whole dataset on every interaction.

    Args:
        documents: The texts to embed; passed straight to ``model.encode``.

    Returns:
        The embeddings produced by ``model.encode`` for ``documents``.
    """
    return model.encode(documents, show_progress_bar=True)
# Embed every document summary.
doc_embeddings = create_embeddings(documents)

# FAISS requires float32 input, so cast the embeddings before indexing.
embedding_matrix = np.asarray(doc_embeddings).astype("float32")

# Flat L2-distance index sized to the embedding width (second axis of the
# matrix), then populated with all document vectors.
_embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(_embedding_dim)
index.add(embedding_matrix)
# Function to retrieve the most relevant documents
def retrieve(query, top_k=10):
    """Look up the documents nearest to ``query`` in the FAISS index.

    Args:
        query: Text to encode with ``model`` and search for.
        top_k: Number of neighbours requested from ``index.search``
            (defaults to 10).

    Returns:
        The first row of the index array returned by ``index.search``, i.e.
        the result positions for this single query.
    """
    encoded_query = np.array(model.encode(query)).astype("float32")
    # index.search is given a batch, so add a leading axis of size one.
    query_batch = np.expand_dims(encoded_query, axis=0)
    _, neighbour_ids = index.search(query_batch, top_k)
    return neighbour_ids[0]
def _format_movie(movie_details):
    """Render one dataset row as a Markdown snippet with one field per line."""
    field_names = ("Title", "Year", "Director", "Cast", "Summary")
    # Markdown treats a lone "\n" as a soft break and renders it as a space,
    # which made all fields run together. Two trailing spaces before the
    # newline force a hard line break.
    return "  \n".join(
        f"**{name}**: {movie_details[name]}" for name in field_names
    )


# Streamlit application layout
st.title("Movie Dataset RAG Application")
query = st.text_input("Ask a question about movies:")

if st.button("Submit"):
    if not query:
        st.write("Please enter a query.")
    else:
        indices = retrieve(query)
        # Skip -1 entries: they mark result slots with no valid document.
        entries = [
            _format_movie(df.iloc[idx]) for idx in indices if idx != -1
        ]
        if entries:
            st.write("Here are some movies that match your query:\n")
            # A blank line between entries makes each movie its own paragraph.
            st.markdown("\n\n".join(entries))
        else:
            st.write("No relevant documents found.")