Spaces:
Sleeping
Sleeping
import json | |
import numpy as np | |
import pandas as pd | |
from sklearn.metrics.pairwise import cosine_similarity | |
class Recommender:
    """Recommend similar records from a table of JSON-encoded embedding vectors.

    The table is consumed as an iterable of records; it is transposed so that
    the first axis selects a field and the second axis selects a record.
    ``title_vec_col`` and ``content_vec_col`` are used as first-axis indices
    into that transposed array, and each cell in those fields must hold a JSON
    array of numbers.
    """

    def __init__(self, id_col, title_col, content_col, title_vec_col, content_vec_col):
        """Store the field selectors used to read the table.

        Args:
            id_col: Selector of the identifier field (stored; not read by the
                methods of this class).
            title_col: Selector of the title field; used to look up titles
                directly on the ``table`` passed to :meth:`recommend_k`.
            content_col: Selector of the content field (stored; not read by
                the methods of this class).
            title_vec_col: First-axis index of the title-vector field in the
                transposed table.
            content_vec_col: First-axis index of the content-vector field in
                the transposed table.
        """
        self.title_vec_col = title_vec_col
        self.content_vec_col = content_vec_col
        self.title_col = title_col
        self.content_col = content_col
        self.id_col = id_col

    def calculate_recom_scores(self, k, similarities):
        """Return up to ``k`` ``(index, score)`` pairs ranked by score.

        Args:
            k: Maximum number of pairs to return.
            similarities: 2-D sequence whose first row holds one similarity
                score per record.

        Returns:
            A list of ``(index, score)`` tuples sorted by descending score.
            The single top-ranked entry is dropped because it is expected to
            be the query record matching itself.
        """
        ranked = sorted(
            enumerate(similarities[0]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # Position 0 is the best match overall (the query itself), so skip it.
        return ranked[1:k + 1]

    def str2arr(self, arr):
        """Decode an iterable of JSON array strings into one NumPy array.

        Args:
            arr: Iterable of strings, each a JSON-encoded list of numbers.

        Returns:
            ``np.ndarray`` stacking the decoded vectors in input order.
        """
        return np.array([np.array(json.loads(text)) for text in arr])

    @staticmethod
    def _merge_scores(*score_lists):
        """Merge ``(index, score)`` lists, keeping the best score per index."""
        best = {}
        for scores in score_lists:
            for index, score in scores:
                if index not in best or score > best[index]:
                    best[index] = score
        # sorted() is stable, so equal scores keep first-seen order.
        return sorted(best.items(), key=lambda pair: pair[1], reverse=True)

    def recommend_k(self, table, k, title):
        """Recommend records similar to the one whose field equals ``title``.

        Title-vector and content-vector cosine similarities are ranked
        separately (top ``k`` each, excluding the query itself) and then
        merged, keeping the higher score when a record appears in both.

        Args:
            table: Iterable of records. It must also support
                ``table[self.title_col][indices].astype(str)``.
            k: Number of neighbours to take from each similarity ranking.
            title: Value to search for in the table.

        Returns:
            A tuple ``(indices, union_scores, titles)``: the recommended
            record positions, the matching ``(index, score)`` pairs sorted by
            descending score, and the recommended titles joined by newlines.

        Raises:
            IndexError: If ``title`` does not occur anywhere in ``table``.
        """
        # Transpose records into fields: data[field, record].
        data = np.array(list(zip(*table)))

        # Each match is a (field, record) coordinate pair. The record position
        # is the second coordinate; the first only says which field matched.
        matches = np.argwhere(data == title)
        if matches.size == 0:
            raise IndexError(f"title {title!r} not found in table")
        idx = int(matches[0][1])

        titles = self.str2arr(data[self.title_vec_col, :])
        contents = self.str2arr(data[self.content_vec_col, :])

        titles_sim = cosine_similarity(titles[idx].reshape(1, -1), titles)
        contents_sim = cosine_similarity(contents[idx].reshape(1, -1), contents)

        titles_scores = self.calculate_recom_scores(k, titles_sim)
        contents_scores = self.calculate_recom_scores(k, contents_sim)

        union_scores = self._merge_scores(titles_scores, contents_scores)
        indices = [index for index, _ in union_scores]

        # NOTE(review): this indexes the original table by title_col and then
        # by a list of positions, which a plain list of row tuples does not
        # support - confirm the table type supplied by the caller.
        titles = '\n'.join(table[self.title_col][indices].astype(str))
        return indices, union_scores, titles