LMartinezEXEX committed • Commit fec7975 • Parent(s): b2e147d
Enhanced code to use .vec or .bin files instead of JSON

- Incorporated English word embeddings (with corresponding examples)
- Started enhancing the code with Python's typing
- .gitattributes +2 -1
- .gitignore +2 -0
- app.py +8 -2
- data/.gitignore +2 -0
- data/{fasttext_embedding_v6.zip → GoogleNews-vectors-negative300-SLIM.bin} +2 -2
- data/data_loader.py +5 -9
- data/mini_embedding_v6.zip +0 -3
- data/wiki-news-300d-1M.vec +0 -3
- examples/examples.py +10 -102
- interfaces/.gitignore +1 -0
- interfaces/interface_WordExplorer.py +9 -3
- modules/.gitignore +1 -0
- modules/model_embbeding.py +138 -42
- modules/module_WordExplorer.py +2 -2
.gitattributes
CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
 data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/GoogleNews-vectors-negative300-SLIM.bin filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1 +1,3 @@
 __pycache__/
+bias_tool_logs/
+*.env
app.py
CHANGED
@@ -13,11 +13,16 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
 # --- Tool config ---
 AVAILABLE_LOGS = True # [True | False]
 LANGUAGE = "english" # [spanish | english]
-
+EMBEDDINGS_PATH = "data/GoogleNews-vectors-negative300-SLIM.bin"
+MAX_NEIGHBORS = 20
 
 # --- Init classes ---
 embedding = Embedding(
-
+    path=EMBEDDINGS_PATH,
+    binary=EMBEDDINGS_PATH.endswith('.bin'),
+    limit=100_000,
+    randomizedPCA=False,
+    max_neighbors=20
 )
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
@@ -30,6 +35,7 @@ INTERFACE_LIST = [
     wordExplorer_interface(
         embedding=embedding,
         available_logs=AVAILABLE_LOGS,
+        max_neighbors=MAX_NEIGHBORS,
        lang=LANGUAGE),
 ]
 
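Reviewer note: the binary flag above is derived from the file extension, which is what lets the same code path consume either a .vec (plain-text) or .bin (binary) word2vec dump. A minimal standalone sketch of that dispatch, not part of the commit, using only gensim's public loader (load_any is a hypothetical name):

    from gensim.models import KeyedVectors

    def load_any(path: str, limit: int = 100_000) -> KeyedVectors:
        # .bin -> binary word2vec serialization; .vec -> plain text.
        # `limit` caps the vocabulary size, as in app.py's limit=100_000.
        return KeyedVectors.load_word2vec_format(path, binary=path.endswith('.bin'), limit=limit)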
data/.gitignore
ADDED
@@ -0,0 +1,2 @@
+__pycache__/
+data_loader.py
data/{fasttext_embedding_v6.zip → GoogleNews-vectors-negative300-SLIM.bin}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:046e0921bcb665f50d646b0963fcef8c5abb5f830d0daba8f686e1dffd6ad832
+size 362017275
data/data_loader.py
CHANGED
@@ -13,16 +13,12 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
     else:
         pca = PCA(n_components=2)
 
+    print("--------> PATH:", path)
     model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
 
     # Cased Vocab
-    cased_words = model.
-
-    #Normalized vectors
-    model.init_sims(replace=True)
-    cased_emb = [model[word] for word in cased_words]
-
-    # PCA reduction
+    cased_words = model.index_to_key
+    cased_emb = model.get_normed_vectors()
     cased_pca = pca.fit_transform(cased_emb)
 
     df_cased = pd.DataFrame(
@@ -36,6 +32,6 @@
 
     df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
     df_uncased = df_cased.drop_duplicates(subset='word')
-    df_uncased
+    return df_uncased
 
-load_embeddings('
+#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
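Reviewer note: the replaced lines are the gensim 3 → 4 migration. init_sims(replace=True) plus per-word indexing was the old idiom for normalized vectors; index_to_key and get_normed_vectors() are the gensim >= 4.0 equivalents. A standalone sketch of the equivalence (the path and limit are illustrative):

    from gensim.models import KeyedVectors

    model = KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300-SLIM.bin", binary=True, limit=1000
    )
    cased_words = model.index_to_key        # gensim 3: model.index2word
    cased_emb = model.get_normed_vectors()  # gensim 3: model.init_sims(replace=True); model.vectors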
data/mini_embedding_v6.zip
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be
-size 94479
data/wiki-news-300d-1M.vec
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
-size 2259088777
examples/examples.py
CHANGED
@@ -1,69 +1,15 @@
-
-
-
-
-
-    "hija": "hija de inmigrantes españoles en",
-    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
-}
-example_joven = {
-    "joven": "",
-    "inmaduro": "",
-    "niño": "",
-    "crio": ""
-}
-example_viejo = {
-    "viejo": "",
-    "maduro": "",
-    "anciano": "",
-    "adulto": ""
-}
+fem_words = 'woman,girl,her,mother,daughter,feminine'
+masc_words = 'man,boy,him,father,son,masculine'
+young_words = 'young,immature,kid,child'
+old_words = 'old,mature,elderly,adult'
+diagnose_words = 'education,cook,wash,sports,clothes,fight,nurse,win,leader,saw,nurse,surgeon,king,queen'
 
+positive_money_words = 'save,economize,administer,manage,business,benefits'
+negative_money_words = 'waste,squander'
+diagnose_money = 'german,australian,argentinian,millionaire,rich,poor'
 
-
-
-    "chico": "fue un chico interesado en artes",
-    "el": "el parque nacional liwonde",
-    "padre": "la muerte de su padre en 1832 se formó",
-    "hijo": "le dice a su hijo aún no nacido como",
-    "masculino": "el mito es esencialmente masculino y entre las causas",
-}
-
-example_diagnose = {
-    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
-    "educación": "sentido de vida religión educación y cultura para cada mujer",
-    "pagado": "un rescate muy grande pagado por sus seguidores a",
-    "cocinar": "empezó a cocinar una sopa usando",
-    "lavar": "era directamente usado para lavar ropa por eso la",
-    "deporte": "se convirtió en el deporte más popular del país",
-    "ropa": "usan el kimono una ropa tradicional japonesa",
-    "pelea": "mal por la violenta pelea entre ambos hermanos",
-    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
-    "ganar": "una necesidad un modo de ganar",
-    "líder": "del estado en manos del líder opositor henrique capriles para el",
-    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
-    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
-    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
-    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
-    "reina": "año ganó el título de reina de la bahía en el"
-}
-
-
-fem_words = ','.join([word for word, context in example_fem.items()])
-fem_contexts = ','.join([context for word, context in example_fem.items()])
-masc_words = ','.join([word for word, context in example_masc.items()])
-masc_contexts = ','.join([context for word, context in example_masc.items()])
-young_words = ','.join([word for word, context in example_joven.items()])
-old_words = ','.join([word for word, context in example_viejo.items()])
-diagnose_words = ','.join([word for word, context in example_diagnose.items()])
-diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])
-
-positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
-negative_money_words = 'malgastar,derrochar'
-diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'
-
-lazy_words = 'vago, perezoso, gandul'
-active_words = 'trabajar, esfuerzo, trabajador'
+lazy_words = 'lazy, sluggish, slacker'
+active_words = 'active, effort , worker'
 
 examples1_explorar_sesgo_en_palabras = [
     [fem_words, masc_words, diagnose_words],
@@ -81,42 +27,4 @@ examples2_explorar_sesgo_en_palabras = [
 examples_explorar_relaciones_entre_palabras = [
     [diagnose_words, fem_words, masc_words, young_words, old_words],
     [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
-]
-
-
-# Examples Datos
-examples_datos = ["ronronear", "ella", "el", "ellos"]
-
-
-# Examples Sesgos en frases
-examples_sesgos_frases = [
-    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
-    ["los cordobeses bailan *", ""],
-    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
-    ["",""]
-]
-
-# Examples Crows_pairs
-examples_crows_pairs = [
-    [ "Los hombres 'musculosos' tienen más suerte con las mujeres",
-      "Los hombres 'gordos' tienen más suerte con las mujeres",
-      "Los hombres 'delgados' tienen más suerte con las mujeres",
-      "Los hombres 'bajos' tienen más suerte con las mujeres",
-      "Los hombres 'rubios' tienen más suerte con las mujeres",
-      "Los hombres 'altos' tienen más suerte con las mujeres"],
-    [ "No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
-      "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
-      "","","",""],
-    [ "Todos sabemos que los 'mapuches' son alcohólicos.",
-      "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
-      "","","",""],
-    [ "Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
-      "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
-      "","","",""],
-    [ "A 'las viejas' les gusta contar chusmerio",
-      "A 'los viejos' les gusta contar chusmerio",
-      "","","",""],
-    [ "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
-      "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
-      "","","",""]
 ]
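Reviewer note: the new English lists are plain comma-separated strings, a few with stray spaces ('active, effort , worker'); diagnose_words also repeats 'nurse' and 'saw' is likely meant to be 'sew' (coser in the removed Spanish list). A hypothetical helper, not in the commit, showing how a consumer can normalize such strings:

    def parse_word_list(words: str) -> list:
        # Split on commas, strip whitespace, drop empty entries.
        return [w.strip() for w in words.split(',') if w.strip()]

    assert parse_word_list('active, effort , worker') == ['active', 'effort', 'worker']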
interfaces/.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
interfaces/interface_WordExplorer.py
CHANGED
@@ -9,7 +9,13 @@ from examples.examples import examples_explorar_relaciones_entre_palabras
 
 plt.rcParams.update({'font.size': 14})
 
-def interface(embedding, available_logs, lang="spanish"):
+def interface(
+    embedding,
+    available_logs: bool,
+    max_neighbors: int, # Updated
+    lang: str="spanish",
+) -> gr.Blocks:
+
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs
@@ -53,10 +59,10 @@ def interface(embedding, available_logs, lang="spanish"):
     with gr.Row():
         with gr.Row():
             gr.Markdown(labels["plotNeighbours"]["title"])
-            n_neighbors = gr.Slider(minimum=0,maximum=
+            n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
         with gr.Row():
             alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
-            fontsize=gr.Number(value=
+            fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
         with gr.Row():
             btn_plot = gr.Button(labels["plot_button"])
         with gr.Row():
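Reviewer note: interface now takes max_neighbors and returns a gr.Blocks. A usage sketch under the same assumptions as app.py (the names come from this commit; the values are illustrative):

    demo = interface(
        embedding=embedding,   # the Embedding instance built in app.py
        available_logs=True,
        max_neighbors=20,      # keep <= Embedding.max_neighbors, or the assert
                               # in getNearestNeighbors will fire at query time
        lang="english",
    )
    demo.launch()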
modules/.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
modules/model_embbeding.py
CHANGED
@@ -1,57 +1,126 @@
+from modules.module_ann import Ann
+from memory_profiler import profile
+from sklearn.neighbors import NearestNeighbors
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+from typing import List
+import os
 import operator
-import numpy as np
 import pandas as pd
+
+import numpy as np
 from numpy import dot
 from gensim import matutils
-from modules.module_ann import Ann
-from memory_profiler import profile
-from sklearn.neighbors import NearestNeighbors
 
 
 class Embedding:
     @profile
-    def __init__(self,
-
-
-
+    def __init__(self,
+        path: str,
+        binary: bool,
+        limit: int=None,
+        randomizedPCA: bool=False,
+        max_neighbors: int=20
+    ) -> None:
+
+        # Embedding vars
+        self.path = path
+        self.limit = limit
+        self.randomizedPCA = randomizedPCA
+        self.binary = binary
+        self.max_neighbors = max_neighbors
 
-        #
+        # Full embedding dataset
        self.ds = None
 
-        #
-        self.
-
-        # Estimate AproximateNearestNeighbors
-        self.ann = None
+        # Estimate NearestNeighbors
+        self.ann = None    # Approximate, with the Annoy method
+        self.neigh = None  # Exact, with the sklearn method
 
         # Load embedding and pca dataset
         self.__load()
 
-    def 
-
-
-    def __load(self):
-        print(f"Preparing {self.ds_subset} embedding...")
+    def __load(
+        self,
+    ) -> None:
 
-
-        self.ds = pd.read_json(self.ds_path)
+        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
-        # ---
-        self.
+        # --- Prepare dataset ---
+        self.ds = self.__preparate(
+            self.path, self.binary, self.limit, self.randomizedPCA
+        )
 
-        # ---
+        # --- Estimate Nearest Neighbors ---
+        # Method A: Approximate, through Annoy's forest of trees
         self.ann = Ann(
             words=self.ds['word'],
             vectors=self.ds['embedding'],
             coord=self.ds['pca']
         )
-        self.ann.init(
+        self.ann.init(
+            n_trees=20, metric='dot', n_jobs=-1
+        )
+
+        # Method B: Exact, through sklearn's NearestNeighbors
+        self.neigh = NearestNeighbors(
+            n_neighbors=self.max_neighbors
+        )
+        self.neigh.fit(
+            X=self.ds['embedding'].to_list()
+        )
+
+    def __preparate(
+        self,
+        path: str,
+        binary: bool,
+        limit: int,
+        randomizedPCA: bool
+    ) -> pd.DataFrame:
+
+        if randomizedPCA:
+            pca = PCA(
+                n_components=2,
+                copy=False,
+                whiten=False,
+                svd_solver='randomized',
+                iterated_power='auto'
+            )
+
+        else:
+            pca = PCA(
+                n_components=2
+            )
+
+        model = KeyedVectors.load_word2vec_format(
+            fname=path,
+            binary=binary,
+            limit=limit
+        )
+
+        # Cased Vocab
+        cased_words = model.index_to_key
+        cased_emb = model.get_normed_vectors()
+        cased_pca = pca.fit_transform(cased_emb)
+
+        df_cased = pd.DataFrame(
+            zip(
+                cased_words,
+                cased_emb,
+                cased_pca
+            ),
+            columns=['word', 'embedding', 'pca']
+        )
 
-
-
-
+        df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+        df_uncased = df_cased.drop_duplicates(subset='word')
+        return df_uncased
 
-    def __getValue(
+    def __getValue(
+        self,
+        word: str,
+        feature: str
+    ):
         word_id, value = None, None
 
         if word in self:
@@ -62,30 +131,57 @@
 
         return value
 
-    def getEmbedding(
+    def getEmbedding(
+        self,
+        word: str
+    ):
+
         return self.__getValue(word, 'embedding')
 
-    def getPCA(
+    def getPCA(
+        self,
+        word: str
+    ):
+
         return self.__getValue(word, 'pca')
 
-    def 
-
-
-
-
-
-
-
+    def getNearestNeighbors(
+        self,
+        word: str,
+        n_neighbors: int=10,
+        nn_method: str='sklearn'
+    ) -> List[str]:
+
+        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must be less than or equal to {self.max_neighbors}!"
+
         if nn_method == 'ann':
             words = self.ann.get(word, n_neighbors)
+
         elif nn_method == 'sklearn':
-            word_emb = self.getEmbedding(word)
-
-            words = operator.itemgetter(*
+            word_emb = self.getEmbedding(word).reshape(1,-1)
+            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors+1)
+            #words = operator.itemgetter(*nn_ids[0])(self.ds['word'].to_list())
+            words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
         else:
            words = []
        return words
 
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
+        return word in self.ds['word'].to_list()
+
+    # ToDo: Review these two methods used in the sesgoEnPalabras tab,
+    # since the embeddings now come normalized
+    def cosineSimilarities(self, vector_1, vectors_all):
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities
+
     def getCosineSimilarities(self, w1, w2):
         return dot(
             matutils.unitvec(self.getEmbedding(w1)),
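Reviewer note: in the sklearn branch of getNearestNeighbors, the fitted index contains the query word's own vector, so the code asks kneighbors for n_neighbors+1 results and drops the first hit. A self-contained sketch of that pattern with toy vectors (not the real embedding):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    words = ['cat', 'kitten', 'car']
    vecs = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0]])

    neigh = NearestNeighbors(n_neighbors=2).fit(vecs)
    _, nn_ids = neigh.kneighbors(vecs[[0]], n_neighbors=2)  # 1 neighbor + the self-hit
    print([words[i] for i in nn_ids[0]][1:])                # ['kitten'] -- self-hit dropped

Relatedly, since __preparate stores the output of get_normed_vectors(), every stored embedding already has unit norm, so the new cosineSimilarities reduces to a plain dot product; that is what the ToDo comment above is pointing at.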
modules/module_WordExplorer.py
CHANGED
@@ -142,8 +142,8 @@ class WordExplorer:
         processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
 
         if n_neighbors > 0:
-            neighbors = self.get_neighbors(word,
-                                           n_neighbors=n_neighbors
+            neighbors = self.get_neighbors(word,
+                                           n_neighbors=n_neighbors,
                                            nn_method=kwargs.get('nn_method', 'sklearn')
                                            )
             for n in neighbors: