Rubens committed on
Commit
7236a82
1 Parent(s): aceb7cf
Files changed (2)
  1. app.py +271 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,271 @@
+ import os
+ import pprint
+ import tempfile
+ from typing import Dict, Text
+ import numpy as np
+ import tensorflow as tf
+ import tensorflow_recommenders as tfrs  # scann 1.2.7 + tensorflow-recommenders 0.7.0 + TF 2.8.0
+ from google.cloud import bigquery  # version 0.30.0
+ from google.oauth2 import service_account
+ import unidecode
+ from nltk import word_tokenize
+ import re
+ import pandas as pd
+ from nltk.util import ngrams
+ import base64
+ import hashlib
+ import gradio as gr
+ import scann
+
+
+ df = pd.read_csv("/Dubai_translated_best_2500.csv", sep=",", header=0)
+
+ # Inspect the raw length of each requirements string.
+ for i in range(0, len(df['requisito'])):
+     print(len(df['requisito'].iloc[i]))
+
+ df = df.drop_duplicates()
+ df = df.dropna()
+
+ df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
+ df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower())
+
+ # First 90% of the rows for training, the remaining 10% held out ("cego" = blind).
+ my_dict = dict(df.iloc[0:int(df.shape[0] * 0.9), :])
+ my_dict_cego = dict(df.iloc[int(df.shape[0] * 0.9):, :])
+
+
+ # Each example: a job code, its title, and the requirements text split into tokens.
+ ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=106)
+ })
+
+ # Check the token counts per example.
+ l = []
+ for x in ratings.as_numpy_iterator():
+     pprint.pprint(len(x['requisito']))
+     l.append(len(x['requisito']))
+
+ min(l)
+
+
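For reference, `maxsplit=106` caps the split at 106 cuts, i.e. at most 107 tokens per example. A quick, self-contained sketch of what the map produces on a made-up string (the input text here is hypothetical):

    import tensorflow as tf

    tokens = tf.strings.split(tf.constant("python sql machine learning"), maxsplit=106)
    print(tokens)  # tf.Tensor([b'python' b'sql' b'machine' b'learning'], shape=(4,), dtype=string)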
+ movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
+     "code": x["code"],
+     "nome_vaga": x["nome_vaga"]
+ })
+ for x in movies.take(1).as_numpy_iterator():
+     pprint.pprint(x)
+
+ movies = movies.map(lambda x: x["code"])
+
+ for x in ratings.take(5).as_numpy_iterator():
+     pprint.pprint(x)
+
+ for x in movies.take(5).as_numpy_iterator():
+     pprint.pprint(x)
+
+ ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
+     "code": x["code"],
+     "requisito": tf.strings.split(x["requisito"], maxsplit=106)
+ })
+
+ tf.random.set_seed(42)
+ shuffled = ratings.shuffle(int(df.shape[0] * 0.9), seed=42, reshuffle_each_iteration=False)
+ shuffled2 = ratings_cego.shuffle(int(df.shape[0] * 0.1), seed=42, reshuffle_each_iteration=False)
+
+ # Take disjoint slices: the original code drew `test` with take(), which made it
+ # a subset of `train`; skip() keeps the two splits disjoint.
+ train = shuffled.take(int(df.shape[0] * 0.8))
+ test = shuffled.skip(int(df.shape[0] * 0.8)).take(int(df.shape[0] * 0.1))
+ cego = shuffled2
+
+ for x in train.take(1).as_numpy_iterator():
+     pprint.pprint(x)
+
+ for x in test.take(5).as_numpy_iterator():
+     pprint.pprint(x)
+
+
+ # Build vocabularies: job codes on one side, requirement tokens on the other.
+ movie_titles = movies  # already mapped to x["code"] above
+ user_ids = ratings.map(lambda x: x["requisito"])
+
+ xx = []
+ for x in user_ids.as_numpy_iterator():
+     xx.append(x)
+
+ unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
+ unique_user_ids = np.unique(np.concatenate(xx))
+
+ user_ids = user_ids.batch(int(df.shape[0] * 0.9))
+
+ layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
+
+ for x in ratings.take(1).as_numpy_iterator():
+     pprint.pprint(x['requisito'])
+
+ for x in ratings.take(5).as_numpy_iterator():
+     pprint.pprint(np.array(layer(x['requisito'])))
+
+ unique_movie_titles[:10]
+
+ embedding_dimension = 768
+
+ user_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_user_ids, mask_token=None),
+     # We add an additional embedding to account for unknown tokens.
+     tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
+ ])
+
+ for x in train.take(5).as_numpy_iterator():
+     pprint.pprint(np.array(user_model(x['requisito'])).shape)
+
+ movie_model = tf.keras.Sequential([
+     tf.keras.layers.StringLookup(
+         vocabulary=unique_movie_titles, mask_token=None),
+     tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
+ ])
+
+ for x in train.take(5).as_numpy_iterator():
+     pprint.pprint(np.array(movie_model(x['code'])).shape)
+
+
+ metrics = tfrs.metrics.FactorizedTopK(
+     candidates=movies.batch(df.shape[0]).map(movie_model)
+ )
+
+ task = tfrs.tasks.Retrieval(
+     metrics=metrics
+ )
+
+
+ class MovielensModel(tfrs.Model):
+
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+
+     def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
+         # We pick out the requirement tokens and pass them into the query tower.
+         user_embeddings = self.user_model(features["requisito"])
+         # And pick out the job codes and pass them into the candidate tower,
+         # getting embeddings back.
+         positive_movie_embeddings = self.movie_model(features["code"])
+
+         # Sum the per-token embeddings into one query vector;
+         # the task computes the loss and the metrics.
+         return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+
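For intuition, the default `tfrs.tasks.Retrieval` loss is an in-batch softmax: each query's own candidate is the positive, and the other batch members serve as negatives. A minimal, self-contained sketch of that computation (random tensors, not the model above):

    import tensorflow as tf

    q = tf.random.normal((4, 768))              # query embeddings (batch, dim)
    c = tf.random.normal((4, 768))              # positive candidate embeddings (batch, dim)
    scores = tf.matmul(q, c, transpose_b=True)  # (batch, batch) similarity logits
    labels = tf.eye(4)                          # diagonal = true (query, candidate) pairs
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)(labels, scores)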
+ # Equivalent model written against plain tf.keras.Model (defined for reference; unused below).
+ class NoBaseClassMovielensModel(tf.keras.Model):
+
+     def __init__(self, user_model, movie_model):
+         super().__init__()
+         self.movie_model: tf.keras.Model = movie_model
+         self.user_model: tf.keras.Model = user_model
+         self.task: tf.keras.layers.Layer = task
+
+     def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+
+         # Set up a gradient tape to record gradients.
+         with tf.GradientTape() as tape:
+
+             # Loss computation; sum per-token embeddings as in compute_loss above.
+             user_embeddings = self.user_model(features["requisito"])
+             positive_movie_embeddings = self.movie_model(features["code"])
+             loss = self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+
+             # Handle regularization losses as well.
+             regularization_loss = sum(self.losses)
+
+             total_loss = loss + regularization_loss
+
+         gradients = tape.gradient(total_loss, self.trainable_variables)
+         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+
+         return metrics
+
+     def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
+
+         # Loss computation; sum per-token embeddings as in compute_loss above.
+         user_embeddings = self.user_model(features["requisito"])
+         positive_movie_embeddings = self.movie_model(features["code"])
+         loss = self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
+
+         # Handle regularization losses as well.
+         regularization_loss = sum(self.losses)
+
+         total_loss = loss + regularization_loss
+
+         metrics = {metric.name: metric.result() for metric in self.metrics}
+         metrics["loss"] = loss
+         metrics["regularization_loss"] = regularization_loss
+         metrics["total_loss"] = total_loss
+
+         return metrics
+
+ model = MovielensModel(user_model, movie_model)
+ model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
+
+ # One large cached batch per split.
+ cached_train = train.shuffle(int(df.shape[0] * 0.9)).batch(int(df.shape[0] * 0.9)).cache()
+ cached_test = test.batch(int(df.shape[0] * 0.1)).cache()
+
+ path = os.path.join("/", "model/")
+
+ # Checkpoint the weights every 2 training batches.
+ cp_callback = tf.keras.callbacks.ModelCheckpoint(
+     filepath=path,
+     verbose=1,
+     save_weights_only=True,
+     save_freq=2)
+
+ model.fit(cached_train, callbacks=[cp_callback], epochs=200)
+
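The script builds `cached_test` but never scores it. A minimal check, assuming the objects defined above (`tfrs.Model` inherits Keras `evaluate`, which reports the factorized top-K metrics):

    # Factorized top-K accuracy on the held-out batch.
    test_metrics = model.evaluate(cached_test, return_dict=True)
    print(test_metrics["factorized_top_k/top_100_categorical_accuracy"])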
+
+ # Embed every job code with the trained candidate tower.
+ index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
+
+ indice = []
+ for i in range(0, df.shape[0]):  # was hardcoded to 1633 rows
+     indice.append(np.array(index.iloc[i][0]))
+
+ # ScaNN index over the candidate embeddings, searched by dot product.
+ # Note: score_brute_force takes only `quantize` in scann 1.2.x.
+ searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
+     num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
+     quantize=True).build()
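A quick smoke test for the index, assuming the objects defined above: querying with one candidate's own embedding should return that candidate among its nearest dot-product neighbors (the pybind searcher exposes single-query `search` alongside `search_batched`):

    q = np.array(indice)[0]                    # embedding of the first job code
    neighbors, distances = searcher.search(q)  # indices into `indice`, plus scores
    print(neighbors[:3], distances[:3])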
+
+ def predict(text):
+     # Lowercase the free-text competences, embed each token with the query
+     # tower, and sum the token embeddings into a single query vector.
+     campos = str(text).lower()
+     query = np.sum([model.user_model(tf.constant(t)) for t in campos.split()], axis=0)
+     # Retrieve the top candidates and map them back to job titles.
+     neighbors, distances = searcher.search_batched([query])
+     xx = df.iloc[neighbors[0], :].nome_vaga
+     return xx
+
+
+ demo = gr.Interface(fn=predict,
+                     inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'),
+                     outputs=gr.outputs.Textbox(label='SUGGESTED VACANCIES'),
+                     css='div {margin-left: auto; margin-right: auto; width: 100%; '
+                         'background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}'
+                     ).launch(auth=("dubai777", "Pa$$w0rd123"), share=False)
+
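Since the checkpoint callback writes weights under `/model/`, a rebuilt model can restore them before serving instead of retraining; a sketch under that assumption, reusing the tower definitions above:

    # Rebuild the two-tower model, then restore the checkpointed weights.
    restored = MovielensModel(user_model, movie_model)
    restored.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))
    restored.load_weights(path)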
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ nltk==3.6.5
+ pandas==1.3.4
+ numpy==1.22.4
+ unidecode==1.2.0
+ tensorflow==2.9.1
+ scann==1.2.7
+ tensorflow-recommenders==0.7.0
+