# rag-movies-app / app.py
# Ahmadkhan12 - "Create app.py" - commit aadb485 (verified) - 2.56 kB
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import streamlit as st
# Load the movie dataset. The path is relative, so the CSV has to be reachable
# from the directory the app is started in (normally next to app.py).
csv_file = "Hydra-Movie-Scrape.csv"

# The 'Summary' column supplies the text that gets embedded. Rows without a
# summary receive a placeholder string so that every document is a string.
df = pd.read_csv(csv_file).fillna({"Summary": "No summary available."})
documents = df["Summary"].to_list()
# Initialize the SentenceTransformer model.
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the model is built inside a cached loader: it is constructed
# once and the same instance is handed back on later reruns.
@st.cache_resource
def load_model():
    """Return the shared 'all-MiniLM-L6-v2' SentenceTransformer instance."""
    return SentenceTransformer('all-MiniLM-L6-v2')


model = load_model()
# Embed the corpus; Streamlit memoises the result
@st.cache_resource
def create_embeddings(documents):
    """Encode ``documents`` with the module-level ``model``.

    Args:
        documents: The texts to embed (the caller passes the list of
            summaries).

    Returns:
        Whatever ``model.encode`` produces for ``documents``; a progress bar
        is shown while encoding. Because of ``st.cache_resource``, a rerun
        with the same argument reuses the stored result.
    """
    return model.encode(documents, show_progress_bar=True)
# Embed every summary (served from the Streamlit cache after the first run)
doc_embeddings = create_embeddings(documents)

# FAISS operates on float32 data, so copy the embeddings into that dtype
embedding_matrix = np.array(doc_embeddings, dtype="float32")

# Flat L2 index sized to the embedding dimension, filled with all documents
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)
# Nearest-neighbour lookup against the FAISS index
def retrieve(query, top_k=10):
    """Return the index ids of the ``top_k`` documents closest to ``query``.

    The query is encoded with the module-level ``model``, converted to
    float32 and searched as a one-row batch against the module-level
    ``index``.

    Args:
        query: The user's question.
        top_k: How many neighbours to request (10 by default).

    Returns:
        The first row of the id matrix returned by ``index.search``; the
        distances are discarded. The caller treats an id of -1 as "no result".
    """
    query_vector = np.array(model.encode(query), dtype="float32")
    # index.search expects a batch, so add a leading axis of length one
    query_batch = np.expand_dims(query_vector, axis=0)
    _, neighbour_ids = index.search(query_batch, top_k)
    return neighbour_ids[0]
# Streamlit application layout: a title, a free-text query box and a submit
# button. On submit, the closest movies are looked up and rendered as Markdown.
st.title("Movie Dataset RAG Application")
query = st.text_input("Ask a question about movies:")

if st.button("Submit"):
    # A query made only of whitespace is treated the same as an empty one.
    if query.strip():
        indices = retrieve(query)

        # Build one Markdown entry per hit.
        entries = []
        for idx in indices:
            if idx == -1:  # -1 marks an invalid / missing search result
                continue
            movie_details = df.iloc[idx]
            # Each field ends with two spaces + newline, which is a Markdown
            # hard line break. A bare "\n" is only a soft break, so the fields
            # would otherwise be rendered together on a single line.
            entries.append(
                f"**Title**: {movie_details['Title']}  \n"
                f"**Year**: {movie_details['Year']}  \n"
                f"**Director**: {movie_details['Director']}  \n"
                f"**Cast**: {movie_details['Cast']}  \n"
                f"**Summary**: {movie_details['Summary']}"
            )

        # A blank line between entries starts a new Markdown paragraph.
        response = "\n\n".join(entries)

        if response:
            st.write("Here are some movies that match your query:\n")
            st.markdown(response)  # Render the formatted entries
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")