Spaces:
Sleeping
Sleeping
first commit ours
Browse files- Qatar_translated_best_2500.csv +0 -0
- app.py +185 -0
- requirements.txt +9 -0
Qatar_translated_best_2500.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pprint
|
3 |
+
import tempfile
|
4 |
+
from typing import Dict, Text
|
5 |
+
import numpy as np
|
6 |
+
import tensorflow as tf
|
7 |
+
import tensorflow_recommenders as tfrs
|
8 |
+
import os
|
9 |
+
import unidecode
|
10 |
+
from nltk import word_tokenize
|
11 |
+
import re
|
12 |
+
import pandas as pd
|
13 |
+
from nltk.util import ngrams
|
14 |
+
import base64
|
15 |
+
import hashlib
|
16 |
+
import gradio as gr
|
17 |
+
import scann
|
18 |
+
|
19 |
+
# ---------------------------------------------------------------------------
# Data loading and tf.data pipeline construction.
# ---------------------------------------------------------------------------


def _to_title_case(value):
    """Lower-case *value* and then title-case it."""
    return value.lower().title()


def _truncate_and_lower(value):
    """Keep the first 1000 characters of *value*, lower-cased."""
    return value[0:1000].lower()


def _split_tokens(text_tensor):
    """Whitespace-split a string tensor, performing at most 106 splits."""
    return tf.strings.split(text_tensor, maxsplit=106)


# Read the CSV, then drop duplicate rows and rows with any missing value.
df = pd.read_csv(
    "/home/user/app/Qatar_translated_best_2500.csv", sep=",", header=0
)
df = df.drop_duplicates().dropna()
df["nome_vaga"] = df["nome_vaga"].map(_to_title_case)
df["requisito"] = df["requisito"].map(_truncate_and_lower)

# Row-order split: the first 90% of rows feed `ratings`, the remaining rows
# form the "cego" slice.
_n_rows = df.shape[0]
_split_row = int(_n_rows * 0.9)
_blind_buffer = int(_n_rows * 0.1)

my_dict = dict(df.iloc[:_split_row, :])
my_dict_cego = dict(df.iloc[_split_row:, :])

# Training examples: code, display name and tokenised requirement text.
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(
    lambda row: {
        "code": row["code"],
        "nome_vaga": row["nome_vaga"],
        "requisito": _split_tokens(row["requisito"]),
    }
)

# Candidate set: the `code` of every row in the cleaned table (not only the
# first 90%).
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(
    lambda row: {"code": row["code"], "nome_vaga": row["nome_vaga"]}
)
movies = movies.map(lambda row: row["code"])

# Examples built from the rows after the 90% mark.
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(
    lambda row: {
        "code": row["code"],
        "requisito": _split_tokens(row["requisito"]),
    }
)

tf.random.set_seed(42)
shuffled = ratings.shuffle(
    _split_row, seed=42, reshuffle_each_iteration=False
)
shuffled2 = ratings_cego.shuffle(
    _blind_buffer, seed=42, reshuffle_each_iteration=False
)

# `ratings` holds `_split_row` examples, so `train` is the whole shuffled set.
train = shuffled.take(_split_row)
# NOTE(review): `test` is taken from the same `shuffled` dataset as `train`,
# so it is a subset of the training examples, not a disjoint hold-out.
test = shuffled.take(_blind_buffer)
cego = shuffled2
movie_titles = movies
|
47 |
+
# Build the vocabularies consumed by the StringLookup layers.
# `user_ids` is the per-example token tensor produced for "requisito".
user_ids = ratings.map(lambda x: x["requisito"])

# Materialise every token array once. The original wrapped `list.append` in a
# bare `except: pass`, which could only hide unrelated errors; any failure
# while iterating the dataset now surfaces normally.
xx = list(user_ids.as_numpy_iterator())

# Distinct candidate codes and distinct requirement tokens.
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
unique_user_ids = np.unique(np.concatenate(xx))

# Kept for parity with the original script; neither name is read again in
# the part of the file visible here.
user_ids = user_ids.batch(int(df.shape[0] * 0.9))
layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
|
59 |
+
# Width of both the query-token and the candidate embeddings.
embedding_dimension = 768


def _build_lookup_tower(vocabulary):
    """Return a Sequential mapping strings in *vocabulary* to embeddings."""
    lookup = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, mask_token=None
    )
    # One additional embedding row accounts for unknown tokens.
    table = tf.keras.layers.Embedding(
        len(vocabulary) + 1, embedding_dimension
    )
    return tf.keras.Sequential([lookup, table])


# Query tower: embeds individual requirement tokens.
user_model = _build_lookup_tower(unique_user_ids)
# Candidate tower: embeds vacancy codes.
movie_model = _build_lookup_tower(unique_movie_titles)

# Top-K metric computed against the embeddings of every candidate code,
# produced as one batch of `df.shape[0]` elements.
_candidate_embeddings = movies.batch(df.shape[0]).map(movie_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=_candidate_embeddings)

task = tfrs.tasks.Retrieval(metrics=metrics)
|
79 |
+
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model trained through the module-level `task`.

    The query side embeds every token of "requisito" and sums the token
    embeddings; the candidate side embeds "code".
    """

    def __init__(self, user_model, movie_model):
        """Store both towers and the module-level retrieval `task`."""
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # `task` is the module-level tfrs.tasks.Retrieval instance.
        self.task: tf.keras.layers.Layer = task

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        """Return the retrieval task's loss for one batch of features.

        Args:
            features: mapping that provides "requisito" and "code".
            training: accepted for API compatibility; not used here.
        """
        token_embeddings = self.user_model(features["requisito"])
        # Collapse the token axis so each example has a single query vector.
        query_embeddings = tf.reduce_sum(token_embeddings, axis=1)
        candidate_embeddings = self.movie_model(features["code"])
        return self.task(query_embeddings, candidate_embeddings)
|
91 |
+
|
92 |
+
class NoBaseClassMovielensModel(tf.keras.Model):
    """Retrieval model with hand-written Keras train/test steps.

    Equivalent in intent to `MovielensModel`, but built directly on
    `tf.keras.Model`. The query side sums the per-token embeddings of
    "requisito" over axis 1, matching `MovielensModel.compute_loss`; the
    original steps passed the un-pooled token embeddings to the task.
    """

    def __init__(self, user_model, movie_model):
        """Store both towers and the module-level retrieval `task`."""
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # `task` is the module-level tfrs.tasks.Retrieval instance.
        self.task: tf.keras.layers.Layer = task

    def _compute_losses(self, features: Dict[Text, tf.Tensor]):
        """Return (task loss, regularization loss, total loss) for a batch."""
        token_embeddings = self.user_model(features["requisito"])
        # Pool tokens into one query vector per example, as compute_loss of
        # MovielensModel does.
        user_embeddings = tf.reduce_sum(token_embeddings, axis=1)
        positive_movie_embeddings = self.movie_model(features["code"])
        loss = self.task(user_embeddings, positive_movie_embeddings)

        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss
        return loss, regularization_loss, total_loss

    def _collect_metrics(
        self, loss, regularization_loss, total_loss
    ) -> Dict[Text, tf.Tensor]:
        """Merge the tracked metric results with the three loss values."""
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def train_step(
        self, features: Dict[Text, tf.Tensor]
    ) -> Dict[Text, tf.Tensor]:
        """Run one optimisation step and return the metrics dictionary."""
        with tf.GradientTape() as tape:
            loss, regularization_loss, total_loss = self._compute_losses(
                features
            )

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.trainable_variables)
        )

        return self._collect_metrics(loss, regularization_loss, total_loss)

    def test_step(
        self, features: Dict[Text, tf.Tensor]
    ) -> Dict[Text, tf.Tensor]:
        """Evaluate one batch without updating weights; return the metrics."""
        loss, regularization_loss, total_loss = self._compute_losses(features)
        return self._collect_metrics(loss, regularization_loss, total_loss)
|
138 |
+
|
139 |
+
# ---------------------------------------------------------------------------
# Train the retrieval model, then build the ScaNN index over all candidates.
# ---------------------------------------------------------------------------
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))

# The batch size equals the training-set size, so an epoch is one batch.
cached_train = (
    train.shuffle(int(df.shape[0] * 0.9))
    .batch(int(df.shape[0] * 0.9))
    .cache()
)
# NOTE(review): `cached_test` is built but not passed to the fit call below.
cached_test = test.batch(int(df.shape[0] * 0.15)).cache()

path = os.path.join("/home/user/app/", "model/")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=path,
    verbose=1,
    save_weights_only=True,
    # Integer save_freq counts batches — TODO confirm this cadence is intended.
    save_freq=2,
)

model.fit(cached_train, callbacks=[cp_callback], epochs=110)

# Embed every candidate code in a single forward pass. The original looped
# over a hard-coded 1633 rows, calling the model once per row and rebuilding
# `np.array(index)` on each iteration; that raises IndexError on a smaller
# table and silently drops rows on a larger one. Row i of `indice` is the
# embedding of `df.iloc[i]`, which `predict` relies on.
indice = model.movie_model(tf.constant(df["code"].to_numpy())).numpy()

# Approximate top-10 dot-product search over the candidate embeddings.
searcher = (
    scann.scann_ops_pybind.builder(indice, 10, "dot_product")
    .tree(
        num_leaves=1500,
        num_leaves_to_search=500,
        training_sample_size=df.shape[0],
    )
    .score_brute_force(2, quantize=True)
    .build()
)
|
161 |
+
|
162 |
+
import matplotlib.pyplot as plt
|
163 |
+
|
164 |
+
def predict(text):
    """Suggest vacancies for free-text competences and plot their scores.

    The text is lower-cased and whitespace-split; the token embeddings from
    `model.user_model` are summed into one query vector, which is looked up
    in the module-level ScaNN `searcher`.

    Args:
        text: the competences entered by the user (converted with `str`).

    Returns:
        A pair `(names, fig)`: the `nome_vaga` values of the retrieved rows
        (a pandas Series) and a matplotlib Figure with one bar per result.
        For input without any token, a short message and an empty figure.
    """
    tokens = str(text).lower().split()

    # Draw on an explicit Figure/Axes instead of pyplot's implicit "current
    # figure", which is shared process-wide. Closing it right away only
    # removes it from pyplot's registry (so figures do not pile up across
    # requests); the Figure object itself is still returned.
    fig, ax = plt.subplots(figsize=(14, 9))
    plt.close(fig)
    ax.set_title('Degree of match')
    ax.set_xlabel('Labels')
    ax.set_ylabel('Distances')

    if not tokens:
        # The original summed an empty list into a scalar and crashed in
        # the search call.
        return "No competences entered.", fig

    # One batched call embeds all tokens; sum over the token axis.
    token_embeddings = model.user_model(tf.constant(tokens))
    query = tf.reduce_sum(token_embeddings, axis=0).numpy()

    neighbors, distances = searcher.search_batched([query])
    xx = df.iloc[neighbors[0], :].nome_vaga

    # Same display scaling as before.
    scores = distances[0] * 0.8 * 10

    # Use numeric bar positions: passing the names as categorical x values
    # merges results that share a name and misplaces the value labels.
    positions = list(range(len(scores)))
    ax.bar(positions, scores)
    ax.set_xticks(positions)
    ax.set_xticklabels(list(xx), rotation=270)

    for position, score in zip(positions, scores):
        ax.text(position, score, str(score), ha='center', va='bottom',
                fontsize=12, color='black')

    return xx, fig
|
179 |
+
|
180 |
+
# Gradio UI: one text input, a text output with the suggested vacancies and
# a plot of their scores.
# - `gr.Textbox` replaces the deprecated `gr.inputs` / `gr.outputs`
#   namespaces; it lives at the same top level as `gr.Plot`, which this
#   file already uses.
# - `demo` now refers to the Interface itself rather than to the return
#   value of `.launch()`.
# - The stray, invalid CSS declaration `repeat 0 0;` is written out as
#   explicit background-repeat / background-position properties.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'
    ),
    outputs=[
        gr.Textbox(label='SUGGESTED VACANCIES'),
        gr.Plot(),
    ],
    css=(
        'div {margin-left: auto; margin-right: auto; width: 100%; '
        'background-image: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI"); '
        'background-repeat: repeat; background-position: 0 0;}'
    ),
)
demo.launch(share=False)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nltk==3.6.5
|
2 |
+
pandas==1.3.4
|
3 |
+
numpy==1.22.4
|
4 |
+
matplotlib==3.4.3
|
5 |
+
unidecode==1.2.0
|
6 |
+
tensorflow==2.9.1
|
7 |
+
scann==1.2.7
|
8 |
+
tensorflow-recommenders==0.7.0
|
9 |
+
|