Commit bb162b6 • Alexander Seifert committed
1 Parent(s): 408486e

add randomize_sample option

Files changed:
- README.md (+1, -1)
- src/data.py (+6, -2)
- src/load.py (+4, -1)
- src/subpages/home.py (+9, -1)
README.md CHANGED
@@ -19,7 +19,7 @@ Error Analysis is an important but often overlooked part of the data science pro
 
 ### Activations
 
-A group of neurons
+A group of neurons tends to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model.
 
 
 ### Embeddings
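The new README sentence describes groups of neurons that co-activate on particular token types. As a rough, hypothetical illustration of the underlying idea (not the app's actual implementation), such groups can be found by factorizing a token-by-neuron activation matrix, for example with non-negative matrix factorization; the matrix below is random stand-in data:

```python
# Hypothetical sketch: factorize FFNN neuron activity so that each component
# groups neurons that tend to fire together (e.g. on punctuation or pronouns).
# Random data stands in for real (tokens x neurons) activations.
import numpy as np
from sklearn.decomposition import NMF

activations = np.random.rand(128, 3072)          # hypothetical non-negative activations
nmf = NMF(n_components=8, init="nndsvd", max_iter=500)
token_factors = nmf.fit_transform(activations)   # (tokens x components): factor strength per token
neuron_groups = nmf.components_                  # (components x neurons): neuron membership per factor
```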
src/data.py CHANGED
@@ -11,7 +11,9 @@ from src.utils import device, tokenizer_hash_funcs
 
 
 @st.cache(allow_output_mutation=True)
-def get_data(
+def get_data(
+    ds_name: str, config_name: str, split_name: str, split_sample_size: int, randomize_sample: bool
+) -> Dataset:
     """Loads a Dataset from the HuggingFace hub (if not already loaded).
 
     Uses `datasets.load_dataset` to load the dataset (see its documentation for additional details).
@@ -25,7 +27,9 @@ def get_data(ds_name: str, config_name: str, split_name: str, split_sample_size:
     Returns:
         Dataset: A Dataset object.
     """
-    ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True).shuffle(
+    ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True).shuffle(
+        seed=0 if randomize_sample else None
+    ) # type: ignore
     split = ds[split_name].select(range(split_sample_size))
     return split
 
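For reference, here is a standalone sketch of the sampling behaviour this hunk introduces (an assumption-laden reconstruction, outside of the Streamlit cache, with placeholder names and dataset): the split is shuffled with a fixed seed when the option is on and with a fresh random generator otherwise, then the first `split_sample_size` rows are kept.

```python
# Hedged sketch of the new sampling logic; not the app's cached get_data().
from datasets import load_dataset

def sample_split(ds_name: str, config_name: str, split_name: str,
                 split_sample_size: int, randomize_sample: bool):
    ds = load_dataset(ds_name, name=config_name)
    # seed=0 gives a reproducible shuffle; seed=None uses a fresh random order.
    shuffled = ds[split_name].shuffle(seed=0 if randomize_sample else None)
    return shuffled.select(range(split_sample_size))

# Example (hypothetical dataset): sample_split("conll2003", None, "validation", 512, randomize_sample=True)
```

Because `get_data` is wrapped in `@st.cache`, the new `randomize_sample` argument also becomes part of the cache key, so changing it triggers a fresh load rather than returning the previously cached sample.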
src/load.py CHANGED
@@ -37,6 +37,7 @@ def load_context(
     ds_config_name: str,
     ds_split_name: str,
     split_sample_size: int,
+    randomize_sample: bool,
     **kw_args,
 ) -> Context:
     """Utility method loading (almost) everything we need for the application.
@@ -63,7 +64,9 @@ def load_context(
     collator = get_collator(tokenizer)
 
     # load data related stuff
-    split: Dataset = get_data(
+    split: Dataset = get_data(
+        ds_name, ds_config_name, ds_split_name, split_sample_size, randomize_sample
+    )
     tags = split.features["ner_tags"].feature
     split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
 
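The signature change follows the existing pattern of naming only the keywords `load_context` actually consumes and letting `**kw_args` absorb the rest. A generic, hypothetical illustration of that pattern (not the app's code) shows why a settings dict with extra keys can still be unpacked into such a function:

```python
# Hypothetical illustration of the "named keywords + **kw_args" pattern used by
# load_context: unrelated keys in the settings dict are simply swallowed.
def configure(split_sample_size: int, randomize_sample: bool, **kw_args) -> dict:
    return {"size": split_sample_size, "randomized": randomize_sample}

settings = {"split_sample_size": 512, "randomize_sample": True, "model_name": "placeholder"}
print(configure(**settings))  # {'size': 512, 'randomized': True}; "model_name" lands in **kw_args
```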
src/subpages/home.py CHANGED
@@ -45,6 +45,7 @@ class HomePage(Page):
         "ds_split_name": "validation",
         "ds_config_name": _CONFIG_NAME,
         "split_sample_size": 512,
+        "randomize_sample": True,
     }
 
     def render(self, context: Optional[Context] = None):
@@ -118,11 +119,18 @@ class HomePage(Page):
             key="split_sample_size",
             help="Sample size for the split, speeds up processing inside streamlit",
         )
+        randomize_sample = st.checkbox(
+            "Randomize sample",
+            key="randomize_sample",
+            help="Whether to randomize the sample",
+        )
         # breakpoint()
         # st.form_submit_button("Submit")
         st.form_submit_button("Load Model & Data")
 
-        split = get_data(
+        split = get_data(
+            ds_name, ds_config_name, ds_split_name, split_sample_size, randomize_sample  # type: ignore
+        )
         labels = list(
            set([n.split("-")[1] for n in split.features["ner_tags"].feature.names if n != "O"])
        )
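To show how the new control behaves in isolation, here is a minimal, self-contained Streamlit sketch (an assumption, not the app's page class): the checkbox sits inside a form, so its value is only acted on once the submit button is pressed.

```python
# Minimal standalone sketch of the new checkbox inside a Streamlit form.
import streamlit as st

with st.form(key="setup"):
    split_sample_size = st.number_input("Sample size", value=512, step=16, key="split_sample_size")
    randomize_sample = st.checkbox(
        "Randomize sample",
        value=True,  # mirrors the new default added to the page's defaults dict
        key="randomize_sample",
        help="Whether to randomize the sample",
    )
    submitted = st.form_submit_button("Load Model & Data")

if submitted:
    st.write(f"Loading {split_sample_size} rows, randomize_sample={randomize_sample}")
```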