Update README.md
README.md
CHANGED
@@ -70,7 +70,125 @@ generation you should look at model like XXX.
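Below is the `Data` class this commit adds to the README: a small loader that exposes three `surprise` built-in rating datasets (`ml_100k`, `ml_1m`, `jester`) and three synthetic readers (`uniform`, `lda_topics`, `lda_rankings`) built from the project's `oportunidades` corpus, each leaving a `(userID, itemID, rating)` DataFrame in `self.df`.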
```python
import math
import random

import numpy as np
import pandas as pd


class Data:
    """Load one of the available rating datasets into a
    (userID, itemID, rating) DataFrame stored in `self.df`."""

    def __init__(self):
        self.available_databases = ['ml_100k', 'ml_1m', 'jester',
                                    'lda_topics', 'lda_rankings', 'uniform']

    def show_available_databases(self):
        print('The available databases are:')
        for i, database in enumerate(self.available_databases):
            print(str(i) + ': ' + database)

    def read_data(self, database_name):
        # Dispatch to the matching reader, e.g. 'ml_100k' -> read_ml_100k().
        self.database_name = database_name
        self.the_data_reader = getattr(self, 'read_' + database_name.lower())
        self.the_data_reader()

    def read_ml_100k(self):
        from surprise import Dataset
        data = Dataset.load_builtin('ml-100k')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_ml_1m(self):
        from surprise import Dataset
        data = Dataset.load_builtin('ml-1m')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_jester(self):
        from surprise import Dataset
        data = Dataset.load_builtin('jester')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_uniform(self):
        # Synthetic baseline: every (user, item, rating) triple is uniform noise.
        n_users = 20
        n_ratings = 10000

        opo = pd.read_csv('../oportunidades.csv')
        df = [(random.randrange(n_users), random.randrange(len(opo)),
               random.randrange(1, 5)) for i in range(n_ratings)]
        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])

    def read_lda_topics(self):
        # Synthetic users: each user is an LDA topic; a document's rating from
        # that user is proportional to the topic's weight in the document.
        import gensim

        n_users = 20
        n_ratings = 10000

        opo = pd.read_csv('../oportunidades_results.csv')
        # opo = opo.iloc[np.where(opo['opo_brazil']=='Y')]

        try:
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
        except FileNotFoundError:
            # Train and save the topic model on demand if it does not exist yet.
            import generate_users
            generate_users.gen_model(n_users)
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')

        df = []
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n, 'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]: topic[1] for topic in topics}
            user = random.choice(list(topics.keys()))
            rating = math.ceil(topics[user] * 5)
            df.append((user, opo_n, rating))

        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])

    def read_lda_rankings(self):
        # Like read_lda_topics, but the rating comes from the topic's rank
        # within the document rather than from its raw proportion.
        import gensim
        import tqdm

        n_users = 9
        n_ratings = 1000

        opo = pd.read_csv('../oportunidades.csv')
        opo = opo.iloc[np.where(opo['opo_brazil'] == 'Y')]
        opo.index = range(len(opo))

        path = f'models/output_linkedin_cle_lda_model_{n_users}_topics_symmetric_alpha_auto_beta'
        lda_model = gensim.models.ldamodel.LdaModel.load(path)

        df = []
        pbar = tqdm.tqdm(total=n_ratings)
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n, 'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]: topic[1] for topic in topics}

            # Rank topics by proportion, then map the normalised rank to 1-5.
            prop = pd.DataFrame([topics], index=['prop']).T.sort_values('prop', ascending=True)
            prop['rating'] = range(1, len(prop) + 1)
            prop['rating'] = prop['rating'] / len(prop)
            prop['rating'] = prop['rating'].apply(lambda x: math.ceil(x * 5))
            prop.reset_index(inplace=True)

            prop = prop.sample(1)
            df.append((prop['index'].values[0], opo_n, prop['rating'].values[0]))
            pbar.update(1)

        pbar.close()
        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])
```
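For reference, a minimal usage sketch (not part of the original README; it assumes `pandas` and `scikit-surprise` are installed, and `surprise` will offer to download MovieLens-100k on first use):

```python
data = Data()
data.show_available_databases()  # 0: ml_100k ... 5: uniform
data.read_data('ml_100k')        # dispatches to read_ml_100k()
print(data.df.head())            # columns: userID, itemID, rating
```

The synthetic readers additionally expect the `oportunidades` CSVs and the saved LDA models at the relative paths hard-coded above. Note how `read_lda_rankings` turns ranks into ratings: with four topics in a document, the ascending ranks map to `ceil(5*k/4)`, i.e. 2, 3, 4, 5.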
### Limitations and bias
While building this model we faced several obstacles. Most of them we overcame, but some, by the nature of the project, could not be fully solved.
Because the dataset was built by ourselves, there has been no interaction between a user and the dataset yet, and therefore we don't have