Yacine Jernite committed on
Commit
d437368
2 Parent(s): 2af16ed dffdb92

Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main

Browse files
cache_dir/wikitext_wikitext-103-v1_train_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 7c77df667d1ea9a678fc9c3f67ea6e066774949926e2e71198ff8f5b062bb885
  • Pointer size: 130 Bytes
  • Size of remote file: 37 kB
cache_dir/wikitext_wikitext-103-v1_validation_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 0f06c1bda3ad86339023fc1d6f95ac1145c629e0f1c003df59949986e40dc897
  • Pointer size: 130 Bytes
  • Size of remote file: 44.8 kB
cache_dir/wikitext_wikitext-2-v1_train_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 2a08287707e4b9430a9f0e6678269212b854ab21b3a34614ec7f15158ba22e93
  • Pointer size: 130 Bytes
  • Size of remote file: 39.6 kB
cache_dir/wikitext_wikitext-2-v1_validation_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 08721bbfc146992d3954e4c4dd54cd12da4441782a2be879e68f129bd5cb6044
  • Pointer size: 130 Bytes
  • Size of remote file: 44.8 kB
data_measurements/dataset_statistics.py CHANGED
@@ -455,7 +455,7 @@ class DatasetStatisticsCacheClass:
455
  self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
456
  else:
457
  logs.info("Calculating vocab afresh")
458
- if len(self.tokenized_df) == 0:
459
  self.tokenized_df = self.do_tokenization()
460
  if save:
461
  logs.info("Writing out.")
 
455
  self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
456
  else:
457
  logs.info("Calculating vocab afresh")
458
+ if self.tokenized_df is None:
459
  self.tokenized_df = self.do_tokenization()
460
  if save:
461
  logs.info("Writing out.")
run_data_measurements.py CHANGED
@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
  Loader specifically for the widgets used in the app.
 
15
  Args:
16
- ds_args:
17
- show_embeddings:
18
- use_cache:
19
 
20
  Returns:
21
-
22
  """
23
 
24
  if not isdir(ds_args["cache_dir"]):
@@ -30,13 +31,15 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
30
 
31
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
32
  use_cache=use_cache)
 
 
33
  # Header widget
34
  dstats.load_or_prepare_dset_peek()
35
  # General stats widget
36
  dstats.load_or_prepare_general_stats()
37
  # Labels widget
38
  try:
39
- dstats.set_label_field("label")
40
  dstats.load_or_prepare_labels()
41
  except:
42
  pass
@@ -56,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
56
  dstats.load_or_prepare_zipf()
57
 
58
 
59
- def load_or_prepare(dataset_args, do_html=False, use_cache=False):
 
 
 
 
 
 
 
 
 
60
  all = False
61
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
62
  print("Loading dataset.")
@@ -79,22 +91,15 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
79
 
80
  if all or dataset_args["calculation"] == "lengths":
81
  print("\n* Calculating text lengths.")
82
- fig_tok_length_fid = pjoin(dstats.cache_path, "lengths_fig.html")
83
- tok_length_json_fid = pjoin(dstats.cache_path, "lengths.json")
84
  dstats.load_or_prepare_text_lengths()
85
- with open(tok_length_json_fid, "w+") as f:
86
- json.dump(dstats.fig_tok_length.to_json(), f)
87
- print("Token lengths now available at %s." % tok_length_json_fid)
88
- if do_html:
89
- dstats.fig_tok_length.write_html(fig_tok_length_fid)
90
- print("Figure saved to %s." % fig_tok_length_fid)
91
  print("Done!")
92
 
93
  if all or dataset_args["calculation"] == "labels":
94
  if not dstats.label_field:
95
- print("Warning: You asked for label calculation, but didn't provide "
96
- "the labels field name. Assuming it is 'label'...")
97
  dstats.set_label_field("label")
 
98
  print("\n* Calculating label distribution.")
99
  dstats.load_or_prepare_labels()
100
  fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
@@ -111,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
111
  npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
112
  dstats, use_cache=use_cache
113
  )
114
- do_npmi(npmi_stats, use_cache=use_cache)
115
  print("Done!")
116
  print(
117
  "nPMI results now available in %s for all identity terms that "
@@ -142,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
142
  dstats.load_or_prepare_embeddings()
143
 
144
 
145
- def do_npmi(npmi_stats, use_cache=True):
146
  available_terms = npmi_stats.load_or_prepare_npmi_terms()
147
  completed_pairs = {}
148
  print("Iterating through terms for joint npmi.")
@@ -165,7 +170,6 @@ def get_text_label_df(
165
  label_field,
166
  calculation,
167
  out_dir,
168
- do_html=False,
169
  use_cache=True,
170
  ):
171
  if not use_cache:
@@ -190,7 +194,7 @@ def get_text_label_df(
190
  "calculation": calculation,
191
  "cache_dir": out_dir,
192
  }
193
- load_or_prepare_widgets(dataset_args, use_cache=use_cache)
194
 
195
 
196
  def main():
@@ -272,18 +276,10 @@ def main():
272
  args = parser.parse_args()
273
  print("Proceeding with the following arguments:")
274
  print(args)
275
- # run_data_measurements.py -n hate_speech18 -c default -s train -f text -w npmi
276
- get_text_label_df(
277
- args.dataset,
278
- args.config,
279
- args.split,
280
- args.feature,
281
- args.label_field,
282
- args.calculation,
283
- args.out_dir,
284
- do_html=args.do_html,
285
- use_cache=args.cached,
286
- )
287
  print()
288
 
289
 
 
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
  Loader specifically for the widgets used in the app.
15
+ Does not take specifications from user.
16
  Args:
17
+ ds_args: Dataset configuration settings (config name, split, etc)
18
+ show_embeddings: Whether to compute embeddings (slow)
19
+ use_cache: Whether to grab files that have already been computed
20
 
21
  Returns:
22
+ Saves files to disk in cache_dir, if user has not specified another dir.
23
  """
24
 
25
  if not isdir(ds_args["cache_dir"]):
 
31
 
32
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
33
  use_cache=use_cache)
34
+ # Embeddings widget
35
+ dstats.load_or_prepare_dataset()
36
  # Header widget
37
  dstats.load_or_prepare_dset_peek()
38
  # General stats widget
39
  dstats.load_or_prepare_general_stats()
40
  # Labels widget
41
  try:
42
+ dstats.set_label_field(ds_args['label_field'])
43
  dstats.load_or_prepare_labels()
44
  except:
45
  pass
 
59
  dstats.load_or_prepare_zipf()
60
 
61
 
62
+ def load_or_prepare(dataset_args, use_cache=False):
63
+ """
64
+ Users can specify which aspects of the dataset they would like to compute.
65
+ Args:
66
+ dataset_args: Dataset configuration settings (config name, split, etc)
67
+ use_cache: Whether to grab files that have already been computed
68
+
69
+ Returns:
70
+ Saves files to disk in cache_dir, if user has not specified another dir.
71
+ """
72
  all = False
73
  dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
74
  print("Loading dataset.")
 
91
 
92
  if all or dataset_args["calculation"] == "lengths":
93
  print("\n* Calculating text lengths.")
 
 
94
  dstats.load_or_prepare_text_lengths()
 
 
 
 
 
 
95
  print("Done!")
96
 
97
  if all or dataset_args["calculation"] == "labels":
98
  if not dstats.label_field:
99
+ print("Warning: You asked for label calculation, but didn't "
100
+ "provide the labels field name. Assuming it is 'label'...")
101
  dstats.set_label_field("label")
102
+ else:
103
  print("\n* Calculating label distribution.")
104
  dstats.load_or_prepare_labels()
105
  fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
 
116
  npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
117
  dstats, use_cache=use_cache
118
  )
119
+ do_npmi(npmi_stats)
120
  print("Done!")
121
  print(
122
  "nPMI results now available in %s for all identity terms that "
 
147
  dstats.load_or_prepare_embeddings()
148
 
149
 
150
+ def do_npmi(npmi_stats):
151
  available_terms = npmi_stats.load_or_prepare_npmi_terms()
152
  completed_pairs = {}
153
  print("Iterating through terms for joint npmi.")
 
170
  label_field,
171
  calculation,
172
  out_dir,
 
173
  use_cache=True,
174
  ):
175
  if not use_cache:
 
194
  "calculation": calculation,
195
  "cache_dir": out_dir,
196
  }
197
+ load_or_prepare(dataset_args, use_cache=use_cache)
198
 
199
 
200
  def main():
 
276
  args = parser.parse_args()
277
  print("Proceeding with the following arguments:")
278
  print(args)
279
+ # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
280
+ get_text_label_df(args.dataset, args.config, args.split, args.feature,
281
+ args.label_field, args.calculation, args.out_dir,
282
+ use_cache=args.cached)
 
 
 
 
 
 
 
 
283
  print()
284
 
285