Jmilagres committed on
Commit e23c933
1 Parent(s): 56ad836

Update README.md

Files changed (1)
  1. README.md +119 -1
README.md CHANGED
@@ -70,7 +70,125 @@ generation you should look at model like XXX.
 
 class Data:
     def __init__(self):
-
+        self.available_databases = ['ml_100k', 'ml_1m', 'jester', 'lda_topics', 'lda_rankings', 'uniform']
+
+    def show_available_databases(self):
+        print('The available databases are:')
+        for i, database in enumerate(self.available_databases):
+            print(str(i)+': '+database)
+
+    def read_data(self, database_name):
+        # Dispatch to the matching read_<name>() method below via getattr.
+        self.database_name = database_name
+        self.the_data_reader = getattr(self, 'read_'+database_name.lower())
+        self.the_data_reader()
+
+    def read_ml_100k(self):
+        # MovieLens 100k ratings via the surprise built-in loader.
+        from surprise import Dataset
+        data = Dataset.load_builtin('ml-100k')
+        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id', 'item_id', 'rating', 'timestamp'])
+        self.df.drop(columns=['timestamp'], inplace=True)
+        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)
+
+    def read_ml_1m(self):
+        # MovieLens 1M ratings via the surprise built-in loader.
+        from surprise import Dataset
+        data = Dataset.load_builtin('ml-1m')
+        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id', 'item_id', 'rating', 'timestamp'])
+        self.df.drop(columns=['timestamp'], inplace=True)
+        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)
+
+    def read_jester(self):
+        # Jester joke ratings via the surprise built-in loader.
+        from surprise import Dataset
+        data = Dataset.load_builtin('jester')
+        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id', 'item_id', 'rating', 'timestamp'])
+        self.df.drop(columns=['timestamp'], inplace=True)
+        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)
+
+    def read_uniform(self):
+        # Synthetic ratings drawn uniformly at random over the opportunities.
+        n_users = 20
+        n_ratings = 10000
+
+        import random
+
+        opo = pd.read_csv('../oportunidades.csv')
+        df = [(random.randrange(n_users), random.randrange(len(opo)), random.randrange(1, 5)) for i in range(n_ratings)]
+        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])
+
+    def read_lda_topics(self):
+        # Synthetic ratings derived from the LDA topic proportions of each opportunity text.
+        n_users = 20
+        n_ratings = 10000
+
+        import gensim
+        import random
+        import math
+
+        opo = pd.read_csv('../oportunidades_results.csv')
+        # opo = opo.iloc[np.where(opo['opo_brazil']=='Y')]
+
+        try:
+            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
+        except:
+            # Train the per-user LDA model if it is not cached on disk yet.
+            import generate_users
+            generate_users.gen_model(n_users)
+            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
+
+        df = []
+        for i in range(n_ratings):
+            opo_n = random.randrange(len(opo))
+            txt = opo.loc[opo_n, 'opo_texto']
+            opo_bow = lda_model.id2word.doc2bow(txt.split())
+            topics = lda_model.get_document_topics(opo_bow)
+            topics = {topic[0]: topic[1] for topic in topics}
+            user = random.sample(list(topics.keys()), 1)[0]
+            rating = math.ceil(topics[user]*5)
+            df.append((user, opo_n, rating))
+
+        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])
+
+    def read_lda_rankings(self):
+        # Synthetic ratings derived from the rank of each topic in a document's topic distribution.
+        n_users = 9
+        n_ratings = 1000
+
+        import gensim
+        import random
+        import math
+        import tqdm
+
+        opo = pd.read_csv('../oportunidades.csv')
+        opo = opo.iloc[np.where(opo['opo_brazil']=='Y')]
+        opo.index = range(len(opo))
+
+        path = f'models/output_linkedin_cle_lda_model_{n_users}_topics_symmetric_alpha_auto_beta'
+        lda_model = gensim.models.ldamodel.LdaModel.load(path)
+
+        df = []
+
+        pbar = tqdm.tqdm(total=n_ratings)
+        for i in range(n_ratings):
+            opo_n = random.randrange(len(opo))
+            txt = opo.loc[opo_n, 'opo_texto']
+            opo_bow = lda_model.id2word.doc2bow(txt.split())
+            topics = lda_model.get_document_topics(opo_bow)
+            topics = {topic[0]: topic[1] for topic in topics}
+
+            # Rank the topics by proportion and rescale the rank to a 1-5 rating.
+            prop = pd.DataFrame([topics], index=['prop']).T.sort_values('prop', ascending=True)
+            prop['rating'] = range(1, len(prop)+1)
+            prop['rating'] = prop['rating']/len(prop)
+            prop['rating'] = prop['rating'].apply(lambda x: math.ceil(x*5))
+            prop.reset_index(inplace=True)
+
+            prop = prop.sample(1)
+
+            df.append((prop['index'].values[0], opo_n, prop['rating'].values[0]))
+            pbar.update(1)
+
+        pbar.close()
+        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])
+
 ### Limitations and bias
 While building this model we faced several obstacles. We overcame most of them, but some, by the nature of the project, could not be fully solved.
 Because the dataset was built by ourselves, there has not yet been any interaction between a user and the dataset, so we don't have
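
For reference, a minimal sketch of how the `Data` loader added in this commit might be used. This is an assumption about the surrounding README example, not part of the commit: it presumes the class is defined in a script that already has `import pandas as pd` (and `import numpy as np` for the LDA readers) at module level, and that the `surprise` package is installed.

```python
# Usage sketch (assumes the Data class above is defined in the same module,
# with pandas imported as pd and numpy as np at the top of the script).
data = Data()
data.show_available_databases()   # lists: ml_100k, ml_1m, jester, lda_topics, lda_rankings, uniform
data.read_data('ml_100k')         # dispatches to read_ml_100k() via getattr
print(data.df.head())             # ratings DataFrame with columns userID, itemID, rating
```

The synthetic readers (`read_uniform`, `read_lda_topics`, `read_lda_rankings`) additionally expect the project-local `../oportunidades*.csv` files and the pre-trained LDA models referenced in the code above.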