Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Files Community

meg-huggingface committed on Dec 7, 2021

Commit

96acdf5

•

1 Parent(s): 4f4c0c4

Adding docstrings to run_data_measurements CLI

Browse files

Files changed (1) hide show

run_data_measurements.py +22 -21

run_data_measurements.py CHANGED Viewed

@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
     Loader specifically for the widgets used in the app.
     Args:
-        ds_args:
-        show_embeddings:
-        use_cache:
     Returns:
     """
     if not isdir(ds_args["cache_dir"]):
@@ -58,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     dstats.load_or_prepare_zipf()
-def load_or_prepare(dataset_args, do_html=False, use_cache=False):
     all = False
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
     print("Loading dataset.")
@@ -86,8 +96,8 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
     if all or dataset_args["calculation"] == "labels":
         if not dstats.label_field:
-            print("Warning: You asked for label calculation, but didn't provide "
-                  "the labels field name.  Assuming it is 'label'...")
             dstats.set_label_field("label")
         else:
             print("\n* Calculating label distribution.")
@@ -106,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
         npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
             dstats, use_cache=use_cache
         )
-        do_npmi(npmi_stats, use_cache=use_cache)
         print("Done!")
         print(
             "nPMI results now available in %s for all identity terms that "
@@ -137,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
         dstats.load_or_prepare_embeddings()
-def do_npmi(npmi_stats, use_cache=True):
     available_terms = npmi_stats.load_or_prepare_npmi_terms()
     completed_pairs = {}
     print("Iterating through terms for joint npmi.")
@@ -160,7 +170,6 @@ def get_text_label_df(
     label_field,
     calculation,
     out_dir,
-    do_html=False,
     use_cache=True,
 ):
     if not use_cache:
@@ -268,17 +277,9 @@ def main():
     print("Proceeding with the following arguments:")
     print(args)
     # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
-    get_text_label_df(
-        args.dataset,
-        args.config,
-        args.split,
-        args.feature,
-        args.label_field,
-        args.calculation,
-        args.out_dir,
-        do_html=args.do_html,
-        use_cache=args.cached,
-    )
     print()

 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
     Loader specifically for the widgets used in the app.
+    Does not take specifications from user.
     Args:
+        ds_args: Dataset configuration settings (config name, split, etc)
+        show_embeddings: Whether to compute embeddings (slow)
+        use_cache: Whether to grab files that have already been computed
     Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
     """
     if not isdir(ds_args["cache_dir"]):
     dstats.load_or_prepare_zipf()
+def load_or_prepare(dataset_args, use_cache=False):
+    """
+    Users can specify which aspects of the dataset they would like to compute.
+    Args:
+        dataset_args: Dataset configuration settings (config name, split, etc)
+        use_cache: Whether to grab files that have already been computed
+    Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
+    """
     all = False
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
     print("Loading dataset.")
     if all or dataset_args["calculation"] == "labels":
         if not dstats.label_field:
+            print("Warning: You asked for label calculation, but didn't "
+                  "provide the labels field name.  Assuming it is 'label'...")
             dstats.set_label_field("label")
         else:
             print("\n* Calculating label distribution.")
         npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
             dstats, use_cache=use_cache
         )
+        do_npmi(npmi_stats)
         print("Done!")
         print(
             "nPMI results now available in %s for all identity terms that "
         dstats.load_or_prepare_embeddings()
+def do_npmi(npmi_stats):
     available_terms = npmi_stats.load_or_prepare_npmi_terms()
     completed_pairs = {}
     print("Iterating through terms for joint npmi.")
     label_field,
     calculation,
     out_dir,
     use_cache=True,
 ):
     if not use_cache:
     print("Proceeding with the following arguments:")
     print(args)
     # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
+    get_text_label_df(args.dataset, args.config, args.split, args.feature,
+                      args.label_field, args.calculation, args.out_dir,
+                      use_cache=args.cached)
     print()