meg-huggingface committed on
Commit
96acdf5
1 Parent(s): 4f4c0c4

Adding docstrings to run_data_measurements CLI

Browse files
Files changed (1) hide show
  1. run_data_measurements.py +22 -21
run_data_measurements.py CHANGED
@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
  Loader specifically for the widgets used in the app.
 
15
  Args:
16
- ds_args:
17
- show_embeddings:
18
- use_cache:
19
 
20
  Returns:
21
-
22
  """
23
 
24
  if not isdir(ds_args["cache_dir"]):
@@ -58,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
58
  dstats.load_or_prepare_zipf()
59
 
60
 
61
- def load_or_prepare(dataset_args, do_html=False, use_cache=False):
 
 
 
 
 
 
 
 
 
62
  all = False
63
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
64
  print("Loading dataset.")
@@ -86,8 +96,8 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
86
 
87
  if all or dataset_args["calculation"] == "labels":
88
  if not dstats.label_field:
89
- print("Warning: You asked for label calculation, but didn't provide "
90
- "the labels field name. Assuming it is 'label'...")
91
  dstats.set_label_field("label")
92
  else:
93
  print("\n* Calculating label distribution.")
@@ -106,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
106
  npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
107
  dstats, use_cache=use_cache
108
  )
109
- do_npmi(npmi_stats, use_cache=use_cache)
110
  print("Done!")
111
  print(
112
  "nPMI results now available in %s for all identity terms that "
@@ -137,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
137
  dstats.load_or_prepare_embeddings()
138
 
139
 
140
- def do_npmi(npmi_stats, use_cache=True):
141
  available_terms = npmi_stats.load_or_prepare_npmi_terms()
142
  completed_pairs = {}
143
  print("Iterating through terms for joint npmi.")
@@ -160,7 +170,6 @@ def get_text_label_df(
160
  label_field,
161
  calculation,
162
  out_dir,
163
- do_html=False,
164
  use_cache=True,
165
  ):
166
  if not use_cache:
@@ -268,17 +277,9 @@ def main():
268
  print("Proceeding with the following arguments:")
269
  print(args)
270
  # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
271
- get_text_label_df(
272
- args.dataset,
273
- args.config,
274
- args.split,
275
- args.feature,
276
- args.label_field,
277
- args.calculation,
278
- args.out_dir,
279
- do_html=args.do_html,
280
- use_cache=args.cached,
281
- )
282
  print()
283
 
284
 
 
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
  Loader specifically for the widgets used in the app.
15
+ Does not take specifications from user.
16
  Args:
17
+ ds_args: Dataset configuration settings (config name, split, etc)
18
+ show_embeddings: Whether to compute embeddings (slow)
19
+ use_cache: Whether to grab files that have already been computed
20
 
21
  Returns:
22
+ Saves files to disk in cache_dir, if user has not specified another dir.
23
  """
24
 
25
  if not isdir(ds_args["cache_dir"]):
 
59
  dstats.load_or_prepare_zipf()
60
 
61
 
62
+ def load_or_prepare(dataset_args, use_cache=False):
63
+ """
64
+ Users can specify which aspects of the dataset they would like to compute.
65
+ Args:
66
+ dataset_args: Dataset configuration settings (config name, split, etc)
67
+ use_cache: Whether to grab files that have already been computed
68
+
69
+ Returns:
70
+ Saves files to disk in cache_dir, if user has not specified another dir.
71
+ """
72
  all = False
73
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
74
  print("Loading dataset.")
 
96
 
97
  if all or dataset_args["calculation"] == "labels":
98
  if not dstats.label_field:
99
+ print("Warning: You asked for label calculation, but didn't "
100
+ "provide the labels field name. Assuming it is 'label'...")
101
  dstats.set_label_field("label")
102
  else:
103
  print("\n* Calculating label distribution.")
 
116
  npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
117
  dstats, use_cache=use_cache
118
  )
119
+ do_npmi(npmi_stats)
120
  print("Done!")
121
  print(
122
  "nPMI results now available in %s for all identity terms that "
 
147
  dstats.load_or_prepare_embeddings()
148
 
149
 
150
+ def do_npmi(npmi_stats):
151
  available_terms = npmi_stats.load_or_prepare_npmi_terms()
152
  completed_pairs = {}
153
  print("Iterating through terms for joint npmi.")
 
170
  label_field,
171
  calculation,
172
  out_dir,
 
173
  use_cache=True,
174
  ):
175
  if not use_cache:
 
277
  print("Proceeding with the following arguments:")
278
  print(args)
279
  # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
280
+ get_text_label_df(args.dataset, args.config, args.split, args.feature,
281
+ args.label_field, args.calculation, args.out_dir,
282
+ use_cache=args.cached)
 
 
 
 
 
 
 
 
283
  print()
284
 
285