LMartinezEXEX committed
Commit fec7975
1 Parent(s): b2e147d

Enhanced code to use .vec or .bin files instead of JSON


Incorporated English word embeddings (with corresponding examples).
Started enhancing the code with Python's typing.

.gitattributes CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
 data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/GoogleNews-vectors-negative300-SLIM.bin filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1 +1,3 @@
 __pycache__/
+bias_tool_logs/
+*.env
app.py CHANGED
@@ -13,11 +13,16 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
 # --- Tool config ---
 AVAILABLE_LOGS = True # [True | False]
 LANGUAGE = "english" # [spanish | english]
-EMBEDDING_SUBSET = "fasttext" # [fasttext | mini]
+EMBEDDINGS_PATH = "data/GoogleNews-vectors-negative300-SLIM.bin"
+MAX_NEIGHBORS = 20
 
 # --- Init classes ---
 embedding = Embedding(
-    subset_name=EMBEDDING_SUBSET
+    path=EMBEDDINGS_PATH,
+    binary=EMBEDDINGS_PATH.endswith('.bin'),
+    limit=100_000,
+    randomizedPCA=False,
+    max_neighbors=20
 )
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
@@ -30,6 +35,7 @@ INTERFACE_LIST = [
     wordExplorer_interface(
         embedding=embedding,
         available_logs=AVAILABLE_LOGS,
+        max_neighbors=MAX_NEIGHBORS,
        lang=LANGUAGE),
 ]
 
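Note on the new config block: gensim's `KeyedVectors.load_word2vec_format` reads the plain-text `.vec` format when `binary=False` and word2vec's binary `.bin` format when `binary=True`, and `limit` caps how many vectors are loaded. A minimal standalone sketch of the loading pattern this config drives (the path matches the repo; the final `print` is illustrative):

```python
from gensim.models import KeyedVectors

EMBEDDINGS_PATH = "data/GoogleNews-vectors-negative300-SLIM.bin"

# .bin files use word2vec's binary format, .vec files are plain text;
# limit=100_000 keeps only the 100k most frequent words, bounding memory.
model = KeyedVectors.load_word2vec_format(
    EMBEDDINGS_PATH,
    binary=EMBEDDINGS_PATH.endswith(".bin"),
    limit=100_000,
)
print(model.vectors.shape)  # e.g. (100000, 300) for 300-d vectors
```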
data/.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__/
+data_loader.py
data/{fasttext_embedding_v6.zip → GoogleNews-vectors-negative300-SLIM.bin} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c35f3dda1d216d9baed3fc77f3b6bb51130f07faf0ee418029344635a0b732b7
-size 165727812
+oid sha256:046e0921bcb665f50d646b0963fcef8c5abb5f830d0daba8f686e1dffd6ad832
+size 362017275
data/data_loader.py CHANGED
@@ -13,16 +13,12 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
     else:
         pca = PCA(n_components=2)
 
+    print("--------> PATH:", path)
     model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
 
     # Cased Vocab
-    cased_words = model.vocab.keys()
-
-    #Normalized vectors
-    model.init_sims(replace=True)
-    cased_emb = [model[word] for word in cased_words]
-
-    # PCA reduction
+    cased_words = model.index_to_key
+    cased_emb = model.get_normed_vectors()
     cased_pca = pca.fit_transform(cased_emb)
 
     df_cased = pd.DataFrame(
@@ -36,6 +32,6 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
 
     df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
     df_uncased = df_cased.drop_duplicates(subset='word')
-    df_uncased.to_json(path[:-3] + 'json')
+    return df_uncased
 
-load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
+#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
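The hunk above is, at its core, the gensim 3.x to 4.x migration: `model.vocab` and `init_sims(replace=True)` were removed in gensim 4.0 in favor of `index_to_key` and `get_normed_vectors()`. A minimal sketch of the equivalence, assuming gensim >= 4.0:

```python
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(
    "data/GoogleNews-vectors-negative300-SLIM.bin", binary=True, limit=10_000
)

# gensim 3.x (removed in 4.0):
#   cased_words = model.vocab.keys()
#   model.init_sims(replace=True)            # normalize vectors in place
#   cased_emb = [model[w] for w in cased_words]

# gensim 4.x equivalents, as used by load_embeddings() above:
cased_words = model.index_to_key         # vocabulary, ordered by frequency
cased_emb = model.get_normed_vectors()   # unit-length vectors, one row per word
```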
data/mini_embedding_v6.zip DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6fa1594f66f29388719f9125eebdd529054f31bc9564e609d5162ba328a054be
-size 94479
data/wiki-news-300d-1M.vec DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dd4d0ea4f00dbd94ea4948957506f5c6601dd06c54150f898ce1acc15621284b
-size 2259088777
examples/examples.py CHANGED
@@ -1,69 +1,15 @@
1
- example_fem = {
2
- "mujer": "la mente de una mujer que durante los últimos",
3
- "chica": "enamorado de la misma chica desde la infancia mary",
4
- "ella": "ella llego a la final",
5
- "madre": "su padre y su madre margarita de parma",
6
- "hija": "hija de inmigrantes españoles en",
7
- "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
8
- }
9
- example_joven = {
10
- "joven": "",
11
- "inmaduro": "",
12
- "niño": "",
13
- "crio": ""
14
- }
15
- example_viejo = {
16
- "viejo": "",
17
- "maduro": "",
18
- "anciano": "",
19
- "adulto": ""
20
- }
21
 
 
 
 
22
 
23
- example_masc = {
24
- "hombre": "deseo innato que todo hombre tiene de comunicar su",
25
- "chico": "fue un chico interesado en artes",
26
- "el": "el parque nacional liwonde",
27
- "padre": "la muerte de su padre en 1832 se formó",
28
- "hijo": "le dice a su hijo aún no nacido como",
29
- "masculino": "el mito es esencialmente masculino y entre las causas",
30
- }
31
-
32
- example_diagnose = {
33
- "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
34
- "educación": "sentido de vida religión educación y cultura para cada mujer",
35
- "pagado": "un rescate muy grande pagado por sus seguidores a",
36
- "cocinar": "empezó a cocinar una sopa usando",
37
- "lavar": "era directamente usado para lavar ropa por eso la",
38
- "deporte": "se convirtió en el deporte más popular del país",
39
- "ropa": "usan el kimono una ropa tradicional japonesa",
40
- "pelea": "mal por la violenta pelea entre ambos hermanos",
41
- "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
42
- "ganar": "una necesidad un modo de ganar",
43
- "líder": "del estado en manos del líder opositor henrique capriles para el",
44
- "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
45
- "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
46
- "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
47
- "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
48
- "reina": "año ganó el título de reina de la bahía en el"
49
- }
50
-
51
-
52
- fem_words = ','.join([word for word, context in example_fem.items()])
53
- fem_contexts = ','.join([context for word, context in example_fem.items()])
54
- masc_words = ','.join([word for word, context in example_masc.items()])
55
- masc_contexts = ','.join([context for word, context in example_masc.items()])
56
- young_words = ','.join([word for word, context in example_joven.items()])
57
- old_words = ','.join([word for word, context in example_viejo.items()])
58
- diagnose_words = ','.join([word for word, context in example_diagnose.items()])
59
- diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])
60
-
61
- positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
62
- negative_money_words = 'malgastar,derrochar'
63
- diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'
64
-
65
- lazy_words = 'vago, perezoso, gandul'
66
- active_words = 'trabajar, esfuerzo, trabajador'
67
 
68
  examples1_explorar_sesgo_en_palabras = [
69
  [fem_words, masc_words, diagnose_words],
@@ -81,42 +27,4 @@ examples2_explorar_sesgo_en_palabras = [
81
  examples_explorar_relaciones_entre_palabras = [
82
  [diagnose_words, fem_words, masc_words, young_words, old_words],
83
  [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
84
- ]
85
-
86
-
87
- # Examples Datos
88
- examples_datos = ["ronronear", "ella", "el", "ellos"]
89
-
90
-
91
- # Examples Sesgos en frases
92
- examples_sesgos_frases = [
93
- ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
94
- ["los cordobeses bailan *", ""],
95
- ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
96
- ["",""]
97
- ]
98
-
99
- # Examples Crows_pairs
100
- examples_crows_pairs = [
101
- [ "Los hombres 'musculosos' tienen más suerte con las mujeres",
102
- "Los hombres 'gordos' tienen más suerte con las mujeres",
103
- "Los hombres 'delgados' tienen más suerte con las mujeres",
104
- "Los hombres 'bajos' tienen más suerte con las mujeres",
105
- "Los hombres 'rubios' tienen más suerte con las mujeres",
106
- "Los hombres 'altos' tienen más suerte con las mujeres"],
107
- [ "No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
108
- "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
109
- "","","",""],
110
- [ "Todos sabemos que los 'mapuches' son alcohólicos.",
111
- "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
112
- "","","",""],
113
- [ "Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
114
- "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
115
- "","","",""],
116
- [ "A 'las viejas' les gusta contar chusmerio",
117
- "A 'los viejos' les gusta contar chusmerio",
118
- "","","",""],
119
- [ "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
120
- "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
121
- "","","",""]
122
  ]
 
1
+ fem_words = 'woman,girl,her,mother,daughter,feminine'
2
+ masc_words = 'man,boy,him,father,son,masculine'
3
+ young_words = 'young,immature,kid,child'
4
+ old_words = 'old,mature,elderly,adult'
5
+ diagnose_words = 'education,cook,wash,sports,clothes,fight,nurse,win,leader,saw,nurse,surgeon,king,queen'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ positive_money_words = 'save,economize,administer,manage,business,benefits'
8
+ negative_money_words = 'waste,squander'
9
+ diagnose_money = 'german,australian,argentinian,millionaire,rich,poor'
10
 
11
+ lazy_words = 'lazy, sluggish, slacker'
12
+ active_words = 'active, effort , worker'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  examples1_explorar_sesgo_en_palabras = [
15
  [fem_words, masc_words, diagnose_words],
 
27
  examples_explorar_relaciones_entre_palabras = [
28
  [diagnose_words, fem_words, masc_words, young_words, old_words],
29
  [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  ]
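The deleted Spanish module built each comma-separated list by joining dict keys; the English replacement inlines the joined result directly and drops the context sentences. A quick illustration of the equivalence (the dict here is hypothetical, mirroring the removed `example_fem`):

```python
# Hypothetical English counterpart of the old example_fem dict:
example_fem = {"woman": "", "girl": "", "her": "",
               "mother": "", "daughter": "", "feminine": ""}

# Old style: derive the list from the dict keys...
fem_words_joined = ','.join(example_fem.keys())

# ...new style: state the literal directly.
fem_words = 'woman,girl,her,mother,daughter,feminine'
assert fem_words_joined == fem_words
```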
interfaces/.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
interfaces/interface_WordExplorer.py CHANGED
@@ -9,7 +9,13 @@ from examples.examples import examples_explorar_relaciones_entre_palabras
 
 plt.rcParams.update({'font.size': 14})
 
-def interface(embedding, available_logs, lang="spanish"):
+def interface(
+    embedding,
+    available_logs: bool,
+    max_neighbors: int, # Updated
+    lang: str="spanish",
+) -> gr.Blocks:
+
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs
@@ -53,10 +59,10 @@ def interface(embedding, available_logs, lang="spanish"):
     with gr.Row():
         with gr.Row():
             gr.Markdown(labels["plotNeighbours"]["title"])
-            n_neighbors = gr.Slider(minimum=0,maximum=100,step=1,label=labels["plotNeighbours"]["quantity"])
+            n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
         with gr.Row():
             alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
-            fontsize=gr.Number(value=18, label=labels["options"]["font-size"])
+            fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
     with gr.Row():
         btn_plot = gr.Button(labels["plot_button"])
     with gr.Row():
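Capping the slider at `max_neighbors` keeps the UI in step with the new `assert` in `Embedding.getNearestNeighbors`, which rejects any `n_neighbors` above the bound used to fit the sklearn index. A minimal sketch of the wiring, assuming the constant is shared as in `app.py`:

```python
import gradio as gr

MAX_NEIGHBORS = 20  # same value app.py passes to both Embedding and interface()

with gr.Blocks() as demo:
    # The slider maximum mirrors Embedding.max_neighbors, so the UI can never
    # request more neighbors than the fitted NearestNeighbors index supports.
    n_neighbors = gr.Slider(minimum=0, maximum=MAX_NEIGHBORS, step=1,
                            label="Neighbors to plot")
```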
modules/.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
modules/model_embbeding.py CHANGED
@@ -1,57 +1,126 @@
+from modules.module_ann import Ann
+from memory_profiler import profile
+from sklearn.neighbors import NearestNeighbors
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+from typing import List
+import os
 import operator
-import numpy as np
 import pandas as pd
+
+import numpy as np
 from numpy import dot
 from gensim import matutils
-from modules.module_ann import Ann
-from memory_profiler import profile
-from sklearn.neighbors import NearestNeighbors
 
 
 class Embedding:
     @profile
-    def __init__(self, subset_name):
-        # Dataset info
-        self.ds_subset = subset_name
-        self.ds_path = f"data/{subset_name}_embedding_v6.zip"
+    def __init__(self,
+        path: str,
+        binary: bool,
+        limit: int=None,
+        randomizedPCA: bool=False,
+        max_neighbors: int=20
+    ) -> None:
+
+        # Embedding vars
+        self.path = path
+        self.limit = limit
+        self.randomizedPCA = randomizedPCA
+        self.binary = binary
+        self.max_neighbors = max_neighbors
 
-        # Pandas dataset
+        # Full embedding dataset
         self.ds = None
 
-        # All Words embedding List[List[float]]
-        self.embedding = None
-
-        # Estimate AproximateNearestNeighbors
-        self.ann = None
+        # Estimate NearestNeighbors
+        self.ann = None    # Aproximate with Annoy method
+        self.neigh = None  # Exact with Sklearn method
 
         # Load embedding and pca dataset
         self.__load()
 
-    def __contains__(self, word):
-        return word in self.ds['word'].to_list()
-
-    def __load(self):
-        print(f"Preparing {self.ds_subset} embedding...")
+    def __load(
+        self,
+    ) -> None:
 
-        # --- Download dataset ---
-        self.ds = pd.read_json(self.ds_path)
+        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
-        # --- Get embedding from string
-        self.embedding = self.ds['embedding'].to_list()
+        # --- Prepare dataset ---
+        self.ds = self.__preparate(
+            self.path, self.binary, self.limit, self.randomizedPCA
+        )
 
-        # --- Get forest tree to estimate Nearest Neighbors ---
+        # --- Estimate Nearest Neighbors
+        # Method A: Througth annoy using forest tree
         self.ann = Ann(
             words=self.ds['word'],
             vectors=self.ds['embedding'],
             coord=self.ds['pca']
         )
-        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)
+        self.ann.init(
+            n_trees=20, metric='dot', n_jobs=-1
+        )
+
+        # Method B: Througth Sklearn method
+        self.neigh = NearestNeighbors(
+            n_neighbors=self.max_neighbors
+        )
+        self.neigh.fit(
+            X=self.ds['embedding'].to_list()
+        )
 
-        # --- Fit Sklearn NN method ---
-        self.neigh = NearestNeighbors(n_neighbors=20)
-        self.neigh.fit(self.embedding)
+    def __preparate(
+        self,
+        path: str,
+        binary: bool,
+        limit: int,
+        randomizedPCA: bool
+    ) -> pd.DataFrame:
+
+        if randomizedPCA:
+            pca = PCA(
+                n_components=2,
+                copy=False,
+                whiten=False,
+                svd_solver='randomized',
+                iterated_power='auto'
+            )
+
+        else:
+            pca = PCA(
+                n_components=2
+            )
+
+        model = KeyedVectors.load_word2vec_format(
+            fname=path,
+            binary=binary,
+            limit=limit
+        )
+
+        # Cased Vocab
+        cased_words = model.index_to_key
+        cased_emb = model.get_normed_vectors()
+        cased_pca = pca.fit_transform(cased_emb)
+
+        df_cased = pd.DataFrame(
+            zip(
+                cased_words,
+                cased_emb,
+                cased_pca
+            ),
+            columns=['word', 'embedding', 'pca']
+        )
 
-    def __getValue(self, word, feature):
+        df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+        df_uncased = df_cased.drop_duplicates(subset='word')
+        return df_uncased
+
+    def __getValue(
+        self,
+        word: str,
+        feature: str
+    ):
         word_id, value = None, None
 
         if word in self:
@@ -62,30 +131,57 @@ class Embedding:
 
         return value
 
-    def getEmbedding(self, word):
+    def getEmbedding(
+        self,
+        word: str
+    ):
+
         return self.__getValue(word, 'embedding')
 
-    def getPCA(self, word):
+    def getPCA(
+        self,
+        word: str
+    ):
+
         return self.__getValue(word, 'pca')
 
-    def cosineSimilarities(self, vector_1, vectors_all):
-        norm = np.linalg.norm(vector_1)
-        all_norms = np.linalg.norm(vectors_all, axis=1)
-        dot_products = dot(vectors_all, vector_1)
-        similarities = dot_products / (norm * all_norms)
-        return similarities
-
-    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
+    def getNearestNeighbors(
+        self,
+        word: str,
+        n_neighbors: int=10,
+        nn_method: str='sklearn'
+    ) -> List[str]:
+
+        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
+
         if nn_method == 'ann':
             words = self.ann.get(word, n_neighbors)
+
         elif nn_method == 'sklearn':
-            word_emb = self.getEmbedding(word)
-            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
-            words = operator.itemgetter(*neighbors)(self.ds['word'])
+            word_emb = self.getEmbedding(word).reshape(1,-1)
+            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors+1)
+            #words = operator.itemgetter(*nn_ids[0])(self.ds['word'].to_list())
+            words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
         else:
             words = []
         return words
 
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
+        return word in self.ds['word'].to_list()
+
+    # ToDo: Revisar estos dos métodos usados en la pestaña sesgoEnPalabras
+    # ya que ahora los embedding vienen normalizados
+    def cosineSimilarities(self, vector_1, vectors_all):
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities
+
     def getCosineSimilarities(self, w1, w2):
         return dot(
             matutils.unitvec(self.getEmbedding(w1)),
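One subtlety in the new sklearn branch of `getNearestNeighbors`: because the index is fitted on the same vectors being queried, a word's nearest neighbor is always itself, so the code asks for `n_neighbors+1` hits and drops the first. This is also why `module_WordExplorer.py` below no longer passes `n_neighbors+1`; the compensation now lives in one place. A standalone sketch of the pattern on toy data (words and vectors here are made up):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

words = ["king", "queen", "man", "woman", "crown"]
rng = np.random.default_rng(0)
vecs = rng.random((5, 300))
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit norm, like get_normed_vectors()

neigh = NearestNeighbors(n_neighbors=3).fit(vecs)

query = vecs[0].reshape(1, -1)                 # the vector for "king"
_, nn_ids = neigh.kneighbors(query, 2 + 1)     # ask for n+1 = 3 hits...
neighbors = [words[i] for i in nn_ids[0]][1:]  # ...and drop hit 0: "king" itself
print(neighbors)  # the 2 nearest words other than "king"
```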
modules/module_WordExplorer.py CHANGED
@@ -142,8 +142,8 @@ class WordExplorer:
                 processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
 
             if n_neighbors > 0:
-                neighbors = self.get_neighbors(word,
-                                               n_neighbors=n_neighbors+1,
+                neighbors = self.get_neighbors(word,
+                                               n_neighbors=n_neighbors,
                                                nn_method=kwargs.get('nn_method', 'sklearn')
                                                )
                 for n in neighbors: