Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 19 14:45:37 2023
@author: Hua
"""
import functools
import json

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
def _parse_embedding(raw):
    """Decode one JSON-encoded embedding cell and cast it to float32."""
    return np.float32(np.array(json.loads(raw)))


# Table read once at import time; `search` returns rows taken from it.
df = pd.read_csv('RAMEmbeddings.csv')

# The GPT2Embeddings column (a pd.Series); each cell is parsed as JSON below.
gpt2bds = df.GPT2Embeddings
# Parsed embeddings, in the same order as the rows of `df`.
gpt2list = [_parse_embedding(cell) for cell in gpt2bds]
# Search: embed a query and rank the stored embeddings against it.
@functools.lru_cache(maxsize=1)
def _load_gpt2_model():
    """Load the sentence-embedding model once and reuse it on later calls."""
    return SentenceTransformer('sembeddings/model_gpt_trained')


def search(inputs, top_k: int = 5) -> "pd.DataFrame":
    """Return the rows of ``df`` whose stored embeddings best match ``inputs``.

    The query is encoded with the model at ``sembeddings/model_gpt_trained``
    and compared to every entry of ``gpt2list`` by cosine similarity.

    Args:
        inputs: The query to encode. Presumably a single string (a
            one-element list also works); more than one query is rejected.
        top_k: Maximum number of rows to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A DataFrame with at most ``top_k`` rows of ``df``, ordered from the
        most to the least similar, with a fresh 0..k-1 index.

    Raises:
        ValueError: If ``top_k`` is smaller than 1, or if ``inputs`` encodes
            to more than one query embedding.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")
    if not gpt2list:
        # Nothing to rank: return an empty frame that keeps df's columns.
        return df.iloc[0:0].reset_index(drop=True)

    # GPT2 embedding of the query (the model is loaded on first use only).
    query_embedding = _load_gpt2_model().encode(inputs)

    # One row per stored embedding, so a single batched call scores the
    # query against all of them instead of one call per row.
    corpus = np.asarray(gpt2list, dtype=np.float32).reshape(len(gpt2list), -1)
    sims = util.pytorch_cos_sim(query_embedding, corpus)
    if sims.shape[0] != 1:
        raise ValueError(
            f"search expects a single query, got {sims.shape[0]} embeddings"
        )
    scores = sims[0].cpu().numpy()

    # Rank by descending similarity; the stable sort keeps tied rows in
    # their original order. Clamping k avoids the out-of-bounds error the
    # fixed-size partition raised on tables with fewer rows than requested.
    k = min(top_k, len(scores))
    best = np.argsort(-scores, kind="stable")[:k]

    # `best` holds positions, not labels, so select positionally.
    return df.iloc[best].reset_index(drop=True)