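# CVPR 2024 Paper Search (Gradio app): embeds a free-text query with
# nomic-embed-text-v1.5 and retrieves the closest CVPR 2024 paper summaries
# from a FAISS index built over precomputed embeddings.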
import faiss
import gradio as gr
import numpy as np
import pandas as pd
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
# Embedding dimensionality kept after truncation; must match the stored paper embeddings.
DIM = 768

# nomic-embed-text-v1.5 ships custom modeling code, so trust_remote_code is required.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
print("Model loaded successfully")
# Load paper metadata and drop rows that are missing a summary or a PDF path.
papers_df = pd.read_csv("data/cvpr2024_papers_with_details.csv", index_col=None, on_bad_lines='skip')
papers_df = papers_df[~papers_df["summary"].isna() & ~papers_df["pdf_path"].isna()]
print("Data loaded successfully")
# Load the precomputed summary embeddings.
with open('data/embeddings.npy', 'rb') as f:
    embeddings = np.load(f)
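# Assumption (not verifiable from this file alone): embeddings.npy holds one
# 768-dimensional vector per row of the filtered papers_df, in the same row
# order and with the same post-processing as encode_query below, so that FAISS
# result positions line up with papers_df.iloc.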
# IndexFlatL2 performs exact L2 search; with L2-normalized vectors the ranking
# is equivalent to cosine similarity.
index = faiss.IndexFlatL2(DIM)
index.add(embeddings)
print("Index loaded successfully")
def encode_query(query):
    """Embed a query and post-process it the same way as the stored embeddings."""
    query_embeddings = model.encode([query], convert_to_tensor=True)
    # Post-processing for nomic-embed-text-v1.5 resizable embeddings:
    # layer-norm, truncate to the target dimension, then L2-normalize.
    query_embeddings = F.layer_norm(query_embeddings, normalized_shape=(query_embeddings.shape[1],))
    query_embeddings = query_embeddings[:, :DIM]
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
    # FAISS expects a float32 NumPy array rather than a torch tensor.
    return query_embeddings.cpu().numpy()
def search_nearest_papers(query, k=5):
    query_embeddings = encode_query(query)
    # The Gradio slider may deliver k as a float; FAISS needs an integer.
    D, I = index.search(query_embeddings, int(k))
    return papers_df.iloc[I[0]][["Title", "arXiv_link"]]
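# Example usage (hypothetical query, assuming the CSV and embeddings files are present):
#   search_nearest_papers("3D scene reconstruction from sparse views", k=3)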
demo = gr.Interface(
    search_nearest_papers,
    [
        "text",
        gr.Slider(1, 10, value=5, step=1, label="Number of results"),
    ],
    gr.Dataframe(
        headers=["Title", "arXiv link"],
    ),
    title="CVPR 2024 Paper Search",
    description="Semantic search over CVPR 2024 paper summaries. This app was built using the data available at https://github.com/harpreetsahota204/CVPR-2024-Papers.",
)
if __name__ == "__main__":
    demo.launch()