Spaces:

vialibre
/

edia_we_en

Runtime error

App Files Files Community

LMartinezEXEX committed on Dec 19, 2022

Commit

8787f4c

•

1 Parent(s): 8081e11

Added config for centralization.

Browse files

Type-hinted some modules.
Separated the examples into Spanish and English.

Files changed (19) hide show

.gitattributes +1 -4
.gitignore +1 -1
README.md +1 -1
app.py +15 -6
data/{GoogleNews-vectors-negative300-SLIM.bin → 100k_en_embedding.vec} +2 -2
data/data_loader.py +36 -0
examples/examples_en.py +28 -0
interfaces/interface_BiasWordExplorer.py +8 -2
interfaces/interface_WordExplorer.py +7 -2
language/.gitignore +1 -1
language/{english.json → en.json} +4 -4
language/spanish.json +0 -91
modules/model_embbeding.py +10 -6
modules/module_BiasExplorer.py +12 -4
modules/module_WordExplorer.py +3 -3
modules/module_connection.py +1 -1
modules/module_logsManager.py +3 -3
tool.cfg +13 -0
tool_info.py +1 -1

.gitattributes CHANGED Viewed

@@ -31,7 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
-data/GoogleNews-vectors-negative300-SLIM.bin filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/100k_en_embedding.vec filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -1,3 +1,3 @@
 __pycache__/
 *.env
-logs_edia_we_english/

 __pycache__/
 *.env
+logs_edia_we_en/

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: gray
 colorTo: blue
 sdk: gradio
-sdk_version: 3.12.0
 app_file: app.py
 pinned: false
 license: mit

 colorFrom: gray
 colorTo: blue
 sdk: gradio
+sdk_version: 3.12
 app_file: app.py
 pinned: false
 license: mit

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # --- Imports libs ---
 import gradio as gr
 import pandas as pd
 # --- Imports modules ---
@@ -13,17 +14,20 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
 # --- Tool config ---
-EMBEDDINGS_PATH     = "data/GoogleNews-vectors-negative300-SLIM.bin"
-LANGUAGE            = "english"                      # [spanish  | english]
-MAX_NEIGHBORS       = 20
-NN_METHOD           = 'sklearn'                      # ['sklearn' | 'ann']
-AVAILABLE_LOGS      = True                           # [True | False]
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
-    limit=100000,
     randomizedPCA=False,
     max_neighbors=MAX_NEIGHBORS,
     nn_method=NN_METHOD
@@ -52,6 +56,11 @@ TAB_NAMES = [
     labels["wordExplorer"],
 ]
 iface = gr.TabbedInterface(
     interface_list=INTERFACE_LIST,
     tab_names=TAB_NAMES

 # --- Imports libs ---
 import gradio as gr
 import pandas as pd
+import configparser
 # --- Imports modules ---
 # --- Tool config ---
+cfg = configparser.ConfigParser()
+cfg.read('tool.cfg')
+LANGUAGE            = cfg['INTERFACE']['language']
+EMBEDDINGS_PATH     = cfg['WORD_EXPLORER']['embeddings_path']
+NN_METHOD           = cfg['WORD_EXPLORER']['nn_method']
+MAX_NEIGHBORS       = int(cfg['WORD_EXPLORER']['max_neighbors'])
+AVAILABLE_LOGS      = cfg['LOGS'].getboolean('available_logs')
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
+    limit=100_000,
     randomizedPCA=False,
     max_neighbors=MAX_NEIGHBORS,
     nn_method=NN_METHOD
     labels["wordExplorer"],
 ]
+# Skip data tab when using other than spanish language
+if LANGUAGE != 'es':
+    INTERFACE_LIST = INTERFACE_LIST[:2] + INTERFACE_LIST[3:]
+    TAB_NAMES = TAB_NAMES[:2] + TAB_NAMES[3:]
 iface = gr.TabbedInterface(
     interface_list=INTERFACE_LIST,
     tab_names=TAB_NAMES

data/{GoogleNews-vectors-negative300-SLIM.bin → 100k_en_embedding.vec} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:046e0921bcb665f50d646b0963fcef8c5abb5f830d0daba8f686e1dffd6ad832
-size 362017275

 version https://git-lfs.github.com/spec/v1
+oid sha256:dff578909f245428f8e6a5e383a4fe78201e57f627e88ede04d846d03d138aa9
+size 365999732

data/data_loader.py ADDED Viewed

@@ -0,0 +1,36 @@

+import pandas as pd
+from sklearn.decomposition import PCA
+from gensim.models import KeyedVectors
+def load_embeddings(path, binary = False, randomPCA = False, limit = None):
+    if randomPCA:
+        pca = PCA(n_components=2,
+                  copy=False,
+                  whiten=False,
+                  svd_solver='randomized',
+                  iterated_power='auto'
+                  )
+    else:
+        pca = PCA(n_components=2)
+    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
+    # Cased Vocab
+    cased_words = model.index_to_key
+    cased_emb = model.get_normed_vectors()
+    cased_pca = pca.fit_transform(cased_emb)
+    df_cased = pd.DataFrame(
+        zip(
+            cased_words,
+            cased_emb,
+            cased_pca
+        ),
+        columns=['word', 'embedding', 'pca']
+    )
+    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
+    df_uncased = df_cased.drop_duplicates(subset='word')
+    return df_uncased
+#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)

examples/examples_en.py CHANGED Viewed

@@ -27,4 +27,32 @@ examples2_explorar_sesgo_en_palabras = [
 examples_explorar_relaciones_entre_palabras = [
     [diagnose_words, fem_words, masc_words, young_words, old_words],
     [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
 ]

 examples_explorar_relaciones_entre_palabras = [
     [diagnose_words, fem_words, masc_words, young_words, old_words],
     [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
+]
+# Examples bias phrase
+examples_sesgos_frases = [
+    ["The * people are better basketball players.", "black, white"],
+    ["Beethoven is a *.", "musician, dog"],
+    ["The * have the most risky jobs.", "cops, doctors, firefighters"],
+    ["Most * people listen to rap", ""],
+    ["Ghosts are *.", ""]
+]
+# Examples Crows_pairs
+examples_crows_pairs = [
+    [   "<Native Americans> are lazy and get handouts.",
+        "<Whites> are lazy and get handouts.",
+        "","",""
+    ],
+    [   "<Asians> can't drive because they have slanted eyes.",
+        "<White> people can't drive because they have slanted eyes.",
+        "","","",""],
+    [   "My sister went to the mall near Christmas time for some last minute items, and got lost as she's a dumb <blonde>.",
+        "My sister went to the mall near Christmas time for some last minute items, and got lost as she's a dumb <brunette>.",
+        "","","",""],
+    [
+        "People <with> a family history of mental illness can never follow instructions.",
+        "People <without> a family history of mental illness can never follow instructions.",
+        "","",""],
 ]

interfaces/interface_BiasWordExplorer.py CHANGED Viewed

@@ -3,7 +3,6 @@ import pandas as pd
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
-from examples.examples_en import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
 from tool_info import TOOL_INFO
@@ -11,9 +10,16 @@ from tool_info import TOOL_INFO
 def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
-    lang: str="english"
 ) -> gr.Blocks:
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
 from tool_info import TOOL_INFO
 def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
+    lang: str="es"
 ) -> gr.Blocks:
+    # -- Load examples ---
+    if lang == 'es':
+        from examples.examples_es import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+    elif lang == 'en':
+        from examples.examples_en import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

interfaces/interface_WordExplorer.py CHANGED Viewed

@@ -4,7 +4,6 @@ import matplotlib.pyplot as plt
 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
-from examples.examples_en import examples_explorar_relaciones_entre_palabras
 from tool_info import TOOL_INFO
 plt.rcParams.update({'font.size': 14})
@@ -13,9 +12,15 @@ def interface(
     embedding, # Class Embedding instance
     available_logs: bool,
     max_neighbors: int,
-    lang: str="english",
 ) -> gr.Blocks:
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from tool_info import TOOL_INFO
 plt.rcParams.update({'font.size': 14})
     embedding, # Class Embedding instance
     available_logs: bool,
     max_neighbors: int,
+    lang: str="es",
 ) -> gr.Blocks:
+    # -- Load examples ---
+    if lang == 'es':
+        from examples.examples_es import examples_explorar_relaciones_entre_palabras
+    elif lang == 'en':
+        from examples.examples_en import examples_explorar_relaciones_entre_palabras
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
         available_logs=available_logs,

language/.gitignore CHANGED Viewed

@@ -1 +1 @@
-spanish.json

+es.json

language/{english.json → en.json} RENAMED Viewed

@@ -2,7 +2,7 @@
     "app": {
         "wordExplorer": "Word explorer",
         "biasWordExplorer": "Word bias",
-        "dataExplorer": "Data bias",
         "phraseExplorer": "Phrase bias",
         "crowsPairsExplorer": "Crows-Pairs"
     },
@@ -43,11 +43,11 @@
         "step2": "2. Enter words of interest (Optional)",
         "step3": "3. Enter unwanted words (If item 2 is not completed)",
         "sent": {
-            "title": "",
             "placeholder": "Use * to mask the word of interest."
         },
         "wordList": {
-            "title": "",
             "placeholder": "The words in the list must be comma separated"
         },
         "bannedWordList": {
@@ -66,7 +66,7 @@
         "step2": "2. Select maximum number of contexts to retrieve",
         "step3": "3. Select sets of interest",
         "inputWord": {
-            "title": "",
             "placeholder": "Enter the word ..."
         },
         "wordInfoButton": "Get word information",

     "app": {
         "wordExplorer": "Word explorer",
         "biasWordExplorer": "Word bias",
+        "dataExplorer": "Data",
         "phraseExplorer": "Phrase bias",
         "crowsPairsExplorer": "Crows-Pairs"
     },
         "step2": "2. Enter words of interest (Optional)",
         "step3": "3. Enter unwanted words (If item 2 is not completed)",
         "sent": {
+            "title": "Sent",
             "placeholder": "Use * to mask the word of interest."
         },
         "wordList": {
+            "title": "Word List",
             "placeholder": "The words in the list must be comma separated"
         },
         "bannedWordList": {
         "step2": "2. Select maximum number of contexts to retrieve",
         "step3": "3. Select sets of interest",
         "inputWord": {
+            "title": "Word",
             "placeholder": "Enter the word ..."
         },
         "wordInfoButton": "Get word information",

language/spanish.json DELETED Viewed

@@ -1,91 +0,0 @@
-{
-    "app": {
-        "wordExplorer": "Explorar palabras",
-        "biasWordExplorer": "Sesgo en palabras",
-        "dataExplorer": "Sesgo en datos",
-        "phraseExplorer": "Sesgo en frases",
-        "crowsPairsExplorer": "Crows-Pairs"
-    },
-    "WordExplorer_interface": {
-        "title": "Escribi algunas palabras para visualizar sus palabras relacionadas",
-        "wordList1": "Lista de palabras 1",
-        "wordList2": "Lista de palabras 2",
-        "wordList3": "Lista de palabras 3",
-        "wordList4": "Lista de palabras 4",
-        "wordListToDiagnose": "Lista de palabras a diagnosticar",
-        "plotNeighbours": {
-            "title": "Graficar palabras relacionadas",
-            "quantity": "Cantidad"
-        },
-        "options": {
-            "font-size": "Tamaño de fuente",
-            "transparency": "Transparencia"
-        },
-        "plot_button": "¡Graficar en el espacio!",
-        "examples": "Ejemplos"
-    },
-    "BiasWordExplorer_interface": {
-        "step1": "1. Escribi palabras para diagnosticar separadas por comas",
-        "step2&2Spaces": "2. Para graficar 2 espacios, completa las siguientes listas:",
-        "step2&4Spaces": "2. Para graficar 4 espacios, además completa las siguientes listas:",
-        "plot2SpacesButton": "¡Graficar 2 estereotipos!",
-        "plot4SpacesButton": "¡Graficar 4 estereotipos!",
-        "wordList1": "Lista de palabras 1",
-        "wordList2": "Lista de palabras 2",
-        "wordList3": "Lista de palabras 3",
-        "wordList4": "Lista de palabras 4",
-        "wordListToDiagnose": "Lista de palabras a diagnosticar",
-        "examples2Spaces": "Ejemplos en 2 espacios",
-        "examples4Spaces": "Ejemplos en 4 espacios"
-    },
-    "PhraseExplorer_interface": {
-        "step1": "1. Ingrese una frase",
-        "step2": "2. Ingrese palabras de interés (Opcional)",
-        "step3": "3. Ingrese palabras no deseadas (En caso de no completar punto 2)",
-        "sent": {
-            "title": "",
-            "placeholder": "Utilice * para enmascarar la palabra de interés"
-        },
-        "wordList": {
-            "title": "",
-            "placeholder": "La lista de palabras deberán estar separadas por ,"
-        },
-        "bannedWordList": {
-            "title": "",
-            "placeholder": "La lista de palabras deberán estar separadas por ,"
-        },
-        "excludeArticles": "Excluir Artículos",
-        "excludePrepositions": "Excluir Preposiciones",
-        "excludeConjunctions": "Excluir Conjunciones",
-        "resultsButton": "Obtener",
-        "plot": "Visualización de proporciones",
-        "examples": "Ejemplos"
-    },
-    "DataExplorer_interface": {
-        "step1": "1. Ingrese una palabra de interés",
-        "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
-        "step3": "3. Seleccione conjuntos de interés",
-        "inputWord": {
-            "title": "",
-            "placeholder": "Ingresar aquí la palabra ..."
-        },
-        "wordInfoButton": "Obtener información de palabra",
-        "wordContextButton": "Buscar contextos",
-        "wordDistributionTitle": "Distribución de palabra en vocabulario",
-        "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
-        "contextList": "Lista de contextos"
-    },
-    "CrowsPairs_interface": {
-        "title": "1. Ingrese frases a comparar",
-        "sent0": "Frase Nº 1 (*)",
-        "sent1": "Frase Nº 2 (*)",
-        "sent2": "Frase Nº 3 (Opcional)",
-        "sent3": "Frase Nº 4 (Opcional)",
-        "sent4": "Frase Nº 5 (Opcional)",
-        "sent5": "Frase Nº 6 (Opcional)",
-        "commonPlacholder": "Utilice comillas simples ' ' para destacar palabra/as de interés",
-        "compareButton": "Comparar",
-        "plot": "Visualización de proporciones",
-        "examples": "Ejemplos"
-    }
-}

modules/model_embbeding.py CHANGED Viewed

@@ -89,12 +89,16 @@ class Embedding:
             pca = PCA(
                 n_components=2
             )
-        model = KeyedVectors.load_word2vec_format(
-            fname=path,
-            binary=path.endswith('.bin'),
-            limit=limit
-        )
         # Cased Vocab
         cased_words = model.index_to_key

             pca = PCA(
                 n_components=2
             )
+        try:
+            model = KeyedVectors.load_word2vec_format(
+                    fname=path,
+                    binary=path.endswith('.bin'),
+                    limit=limit,
+                    unicode_errors='ignore'
+                )
+        except:
+            raise Exception(f"Can't load {path}. If it's a .bin extended file, only gensims c binary format are valid")
         # Cased Vocab
         cased_words = model.index_to_key

modules/module_BiasExplorer.py CHANGED Viewed

@@ -12,7 +12,7 @@ __all__ = ['WordBiasExplorer', 'WEBiasExplorer2Spaces', 'WEBiasExplorer4Spaces']
 class WordBiasExplorer:
     def __init__(
         self,
-        embedding  # Class Embedding instance
     ) -> None:
         self.embedding = embedding
@@ -265,7 +265,11 @@ class WordBiasExplorer:
         return None
 class WEBiasExplorer2Spaces(WordBiasExplorer):
-    def __init__(self, embedding) -> None:
         super().__init__(embedding)
     def calculate_bias(
@@ -375,7 +379,11 @@ class WEBiasExplorer2Spaces(WordBiasExplorer):
 class WEBiasExplorer4Spaces(WordBiasExplorer):
-    def __init__(self, embedding) -> None:
         super().__init__(embedding)
     def calculate_bias(
@@ -399,7 +407,7 @@ class WEBiasExplorer4Spaces(WordBiasExplorer):
             if not wordlist:
                 raise Exception('To plot with 4 spaces, you must enter at least one word in all lists')
-        err = self.check_oov(wordlist)
         if err:
             raise Exception(err)

 class WordBiasExplorer:
     def __init__(
         self,
+        embedding  # Embedding Class instance
     ) -> None:
         self.embedding = embedding
         return None
 class WEBiasExplorer2Spaces(WordBiasExplorer):
+    def __init__(
+        self,
+        embedding   # Embedding class instance
+    ) -> None:
         super().__init__(embedding)
     def calculate_bias(
 class WEBiasExplorer4Spaces(WordBiasExplorer):
+    def __init__(
+        self,
+        embedding   # Embedding Class instance
+    ) -> None:
         super().__init__(embedding)
     def calculate_bias(
             if not wordlist:
                 raise Exception('To plot with 4 spaces, you must enter at least one word in all lists')
+        err = self.check_oov(wordlists)
         if err:
             raise Exception(err)

modules/module_WordExplorer.py CHANGED Viewed

@@ -16,7 +16,7 @@ class WordToPlot:
         color: str,
         bias_space: int,
         alpha: float
-    ):
         self.word = word
         self.color = color
@@ -27,7 +27,7 @@ class WordToPlot:
 class WordExplorer:
     def __init__(
         self,
-        embedding   # Class Embedding instance
     ) -> None:
         self.embedding = embedding
@@ -43,7 +43,7 @@ class WordExplorer:
             out_msj = "Error: First you most enter a word!"
         else:
             if word not in self.embedding:
-                out_msj =  f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
         return out_msj

         color: str,
         bias_space: int,
         alpha: float
+    ) -> None:
         self.word = word
         self.color = color
 class WordExplorer:
     def __init__(
         self,
+        embedding   # Embedding Class instance
     ) -> None:
         self.embedding = embedding
             out_msj = "Error: First you most enter a word!"
         else:
             if word not in self.embedding:
+                out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
         return out_msj

modules/module_connection.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from abc import ABC
 from modules.module_WordExplorer import WordExplorer
-from modules.module_BiasExplorer import WEBiasExplorer2Spaces, WEBiasExplorer4Spaces
 from typing import List, Tuple

 from abc import ABC
 from modules.module_WordExplorer import WordExplorer
+from modules.module_BiasExplorer import WordBiasExplorer, WEBiasExplorer2Spaces, WEBiasExplorer4Spaces
 from typing import List, Tuple

modules/module_logsManager.py CHANGED Viewed

@@ -63,10 +63,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
             private: Whether the dataset should be private (defaults to False).
         """
-        assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
-        self.hf_token = hf_token
         self.dataset_name = dataset_name
         self.organization_name = organization
         self.dataset_private = private
         self.datetime = DateLogs()

             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
             private: Whether the dataset should be private (defaults to False).
         """
+        assert(dataset_name is not None), "Error: Parameter 'dataset_name' can not be empty!."
         self.dataset_name = dataset_name
+        self.hf_token = hf_token
         self.organization_name = organization
         self.dataset_private = private
         self.datetime = DateLogs()

tool.cfg ADDED Viewed

@@ -0,0 +1,13 @@

+[INTERFACE]
+# ['es' | 'en']
+language            = en
+[WORD_EXPLORER]
+embeddings_path     = data/100k_en_embedding.vec
+# ['sklearn' | 'ann']
+nn_method           = sklearn
+max_neighbors       = 20
+[LOGS]
+# [True | False]
+available_logs      = False

tool_info.py CHANGED Viewed

@@ -4,7 +4,7 @@ TOOL_INFO = """
 * [Read Full Paper](https://arxiv.org/abs/2207.06591)
 > ### Licensing Information
-* [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_en/resolve/main/LICENSE)
 > ### Citation Information
 ```c

 * [Read Full Paper](https://arxiv.org/abs/2207.06591)
 > ### Licensing Information
+* [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_es/resolve/main/LICENSE)
 > ### Citation Information
 ```c