Update README.md
README.md
CHANGED
@@ -70,7 +70,125 @@ generation you should look at model like XXX.
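Below is the `Data` class this commit adds to the README: a small loader that exposes three `surprise` built-in rating datasets (`ml_100k`, `ml_1m`, `jester`) and three synthetic readers (`uniform`, `lda_topics`, `lda_rankings`) built from the project's `oportunidades` corpus, each leaving a `(userID, itemID, rating)` DataFrame in `self.df`.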
```python
import math
import random

import numpy as np
import pandas as pd


class Data:
    """Load one of the available rating datasets into a
    (userID, itemID, rating) DataFrame stored in `self.df`."""

    def __init__(self):
        self.available_databases = ['ml_100k', 'ml_1m', 'jester',
                                    'lda_topics', 'lda_rankings', 'uniform']

    def show_available_databases(self):
        print('The available databases are:')
        for i, database in enumerate(self.available_databases):
            print(str(i) + ': ' + database)

    def read_data(self, database_name):
        # Dispatch to the matching reader, e.g. 'ml_100k' -> read_ml_100k().
        self.database_name = database_name
        self.the_data_reader = getattr(self, 'read_' + database_name.lower())
        self.the_data_reader()

    def read_ml_100k(self):
        from surprise import Dataset
        data = Dataset.load_builtin('ml-100k')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_ml_1m(self):
        from surprise import Dataset
        data = Dataset.load_builtin('ml-1m')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_jester(self):
        from surprise import Dataset
        data = Dataset.load_builtin('jester')
        self.df = pd.DataFrame(data.raw_ratings,
                               columns=['user_id', 'item_id', 'rating', 'timestamp'])
        self.df.drop(columns=['timestamp'], inplace=True)
        self.df.rename({'user_id': 'userID', 'item_id': 'itemID'}, axis=1, inplace=True)

    def read_uniform(self):
        # Synthetic baseline: every (user, item, rating) triple is uniform noise.
        n_users = 20
        n_ratings = 10000

        opo = pd.read_csv('../oportunidades.csv')
        df = [(random.randrange(n_users), random.randrange(len(opo)),
               random.randrange(1, 5)) for i in range(n_ratings)]
        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])

    def read_lda_topics(self):
        # Synthetic users: each user is an LDA topic; a document's rating from
        # that user is proportional to the topic's weight in the document.
        import gensim

        n_users = 20
        n_ratings = 10000

        opo = pd.read_csv('../oportunidades_results.csv')
        # opo = opo.iloc[np.where(opo['opo_brazil']=='Y')]

        try:
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
        except FileNotFoundError:
            # Train and save the topic model on demand if it does not exist yet.
            import generate_users
            generate_users.gen_model(n_users)
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')

        df = []
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n, 'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]: topic[1] for topic in topics}
            user = random.choice(list(topics.keys()))
            rating = math.ceil(topics[user] * 5)
            df.append((user, opo_n, rating))

        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])

    def read_lda_rankings(self):
        # Like read_lda_topics, but the rating comes from the topic's rank
        # within the document rather than from its raw proportion.
        import gensim
        import tqdm

        n_users = 9
        n_ratings = 1000

        opo = pd.read_csv('../oportunidades.csv')
        opo = opo.iloc[np.where(opo['opo_brazil'] == 'Y')]
        opo.index = range(len(opo))

        path = f'models/output_linkedin_cle_lda_model_{n_users}_topics_symmetric_alpha_auto_beta'
        lda_model = gensim.models.ldamodel.LdaModel.load(path)

        df = []
        pbar = tqdm.tqdm(total=n_ratings)
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n, 'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]: topic[1] for topic in topics}

            # Rank topics by proportion, then map the normalised rank to 1-5.
            prop = pd.DataFrame([topics], index=['prop']).T.sort_values('prop', ascending=True)
            prop['rating'] = range(1, len(prop) + 1)
            prop['rating'] = prop['rating'] / len(prop)
            prop['rating'] = prop['rating'].apply(lambda x: math.ceil(x * 5))
            prop.reset_index(inplace=True)

            prop = prop.sample(1)
            df.append((prop['index'].values[0], opo_n, prop['rating'].values[0]))
            pbar.update(1)

        pbar.close()
        self.df = pd.DataFrame(df, columns=['userID', 'itemID', 'rating'])
```
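For reference, a minimal usage sketch (not part of the original README; it assumes `pandas` and `scikit-surprise` are installed, and `surprise` will offer to download MovieLens-100k on first use):

```python
data = Data()
data.show_available_databases()  # 0: ml_100k ... 5: uniform
data.read_data('ml_100k')        # dispatches to read_ml_100k()
print(data.df.head())            # columns: userID, itemID, rating
```

The synthetic readers additionally expect the `oportunidades` CSVs and the saved LDA models at the relative paths hard-coded above. Note how `read_lda_rankings` turns ranks into ratings: with four topics in a document, the ascending ranks map to `ceil(5*k/4)`, i.e. 2, 3, 4, 5.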
### Limitations and bias
While building this model we faced several obstacles. Most of them we overcame, but some, by the nature of the project, could not be fully solved.
Because the dataset was built by ourselves, there has been no interaction between a user and the dataset yet, and therefore we don't have