Spaces:
Build error
Build error
Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main
Browse files- cache_dir/wikitext_wikitext-103-v1_train_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-103-v1_validation_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-v1_train_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-v1_validation_text/fig_tok_length.png +3 -0
- data_measurements/dataset_statistics.py +1 -1
- run_data_measurements.py +28 -32
cache_dir/wikitext_wikitext-103-v1_train_text/fig_tok_length.png
ADDED
Git LFS Details
|
cache_dir/wikitext_wikitext-103-v1_validation_text/fig_tok_length.png
ADDED
Git LFS Details
|
cache_dir/wikitext_wikitext-2-v1_train_text/fig_tok_length.png
ADDED
Git LFS Details
|
cache_dir/wikitext_wikitext-2-v1_validation_text/fig_tok_length.png
ADDED
Git LFS Details
|
data_measurements/dataset_statistics.py
CHANGED
@@ -455,7 +455,7 @@ class DatasetStatisticsCacheClass:
|
|
455 |
self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
|
456 |
else:
|
457 |
logs.info("Calculating vocab afresh")
|
458 |
-
if
|
459 |
self.tokenized_df = self.do_tokenization()
|
460 |
if save:
|
461 |
logs.info("Writing out.")
|
|
|
455 |
self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
|
456 |
else:
|
457 |
logs.info("Calculating vocab afresh")
|
458 |
+
if self.tokenized_df is None:
|
459 |
self.tokenized_df = self.do_tokenization()
|
460 |
if save:
|
461 |
logs.info("Writing out.")
|
run_data_measurements.py
CHANGED
@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
|
|
12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
13 |
"""
|
14 |
Loader specifically for the widgets used in the app.
|
|
|
15 |
Args:
|
16 |
-
ds_args:
|
17 |
-
show_embeddings:
|
18 |
-
use_cache:
|
19 |
|
20 |
Returns:
|
21 |
-
|
22 |
"""
|
23 |
|
24 |
if not isdir(ds_args["cache_dir"]):
|
@@ -30,13 +31,15 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
|
30 |
|
31 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
|
32 |
use_cache=use_cache)
|
|
|
|
|
33 |
# Header widget
|
34 |
dstats.load_or_prepare_dset_peek()
|
35 |
# General stats widget
|
36 |
dstats.load_or_prepare_general_stats()
|
37 |
# Labels widget
|
38 |
try:
|
39 |
-
dstats.set_label_field(
|
40 |
dstats.load_or_prepare_labels()
|
41 |
except:
|
42 |
pass
|
@@ -56,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
|
56 |
dstats.load_or_prepare_zipf()
|
57 |
|
58 |
|
59 |
-
def load_or_prepare(dataset_args,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
all = False
|
61 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
62 |
print("Loading dataset.")
|
@@ -79,22 +91,15 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
79 |
|
80 |
if all or dataset_args["calculation"] == "lengths":
|
81 |
print("\n* Calculating text lengths.")
|
82 |
-
fig_tok_length_fid = pjoin(dstats.cache_path, "lengths_fig.html")
|
83 |
-
tok_length_json_fid = pjoin(dstats.cache_path, "lengths.json")
|
84 |
dstats.load_or_prepare_text_lengths()
|
85 |
-
with open(tok_length_json_fid, "w+") as f:
|
86 |
-
json.dump(dstats.fig_tok_length.to_json(), f)
|
87 |
-
print("Token lengths now available at %s." % tok_length_json_fid)
|
88 |
-
if do_html:
|
89 |
-
dstats.fig_tok_length.write_html(fig_tok_length_fid)
|
90 |
-
print("Figure saved to %s." % fig_tok_length_fid)
|
91 |
print("Done!")
|
92 |
|
93 |
if all or dataset_args["calculation"] == "labels":
|
94 |
if not dstats.label_field:
|
95 |
-
print("Warning: You asked for label calculation, but didn't
|
96 |
-
"the labels field name. Assuming it is 'label'...")
|
97 |
dstats.set_label_field("label")
|
|
|
98 |
print("\n* Calculating label distribution.")
|
99 |
dstats.load_or_prepare_labels()
|
100 |
fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
|
@@ -111,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
111 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
112 |
dstats, use_cache=use_cache
|
113 |
)
|
114 |
-
do_npmi(npmi_stats
|
115 |
print("Done!")
|
116 |
print(
|
117 |
"nPMI results now available in %s for all identity terms that "
|
@@ -142,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
142 |
dstats.load_or_prepare_embeddings()
|
143 |
|
144 |
|
145 |
-
def do_npmi(npmi_stats
|
146 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
147 |
completed_pairs = {}
|
148 |
print("Iterating through terms for joint npmi.")
|
@@ -165,7 +170,6 @@ def get_text_label_df(
|
|
165 |
label_field,
|
166 |
calculation,
|
167 |
out_dir,
|
168 |
-
do_html=False,
|
169 |
use_cache=True,
|
170 |
):
|
171 |
if not use_cache:
|
@@ -190,7 +194,7 @@ def get_text_label_df(
|
|
190 |
"calculation": calculation,
|
191 |
"cache_dir": out_dir,
|
192 |
}
|
193 |
-
|
194 |
|
195 |
|
196 |
def main():
|
@@ -272,18 +276,10 @@ def main():
|
|
272 |
args = parser.parse_args()
|
273 |
print("Proceeding with the following arguments:")
|
274 |
print(args)
|
275 |
-
# run_data_measurements.py -
|
276 |
-
get_text_label_df(
|
277 |
-
|
278 |
-
|
279 |
-
args.split,
|
280 |
-
args.feature,
|
281 |
-
args.label_field,
|
282 |
-
args.calculation,
|
283 |
-
args.out_dir,
|
284 |
-
do_html=args.do_html,
|
285 |
-
use_cache=args.cached,
|
286 |
-
)
|
287 |
print()
|
288 |
|
289 |
|
|
|
12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
13 |
"""
|
14 |
Loader specifically for the widgets used in the app.
|
15 |
+
Does not take specifications from user.
|
16 |
Args:
|
17 |
+
ds_args: Dataset configuration settings (config name, split, etc)
|
18 |
+
show_embeddings: Whether to compute embeddings (slow)
|
19 |
+
use_cache: Whether to grab files that have already been computed
|
20 |
|
21 |
Returns:
|
22 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
23 |
"""
|
24 |
|
25 |
if not isdir(ds_args["cache_dir"]):
|
|
|
31 |
|
32 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
|
33 |
use_cache=use_cache)
|
34 |
+
# Embeddings widget
|
35 |
+
dstats.load_or_prepare_dataset()
|
36 |
# Header widget
|
37 |
dstats.load_or_prepare_dset_peek()
|
38 |
# General stats widget
|
39 |
dstats.load_or_prepare_general_stats()
|
40 |
# Labels widget
|
41 |
try:
|
42 |
+
dstats.set_label_field(ds_args['label_field'])
|
43 |
dstats.load_or_prepare_labels()
|
44 |
except:
|
45 |
pass
|
|
|
59 |
dstats.load_or_prepare_zipf()
|
60 |
|
61 |
|
62 |
+
def load_or_prepare(dataset_args, use_cache=False):
|
63 |
+
"""
|
64 |
+
Users can specify which aspects of the dataset they would like to compute.
|
65 |
+
Args:
|
66 |
+
dataset_args: Dataset configuration settings (config name, split, etc)
|
67 |
+
use_cache: Whether to grab files that have already been computed
|
68 |
+
|
69 |
+
Returns:
|
70 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
71 |
+
"""
|
72 |
all = False
|
73 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
74 |
print("Loading dataset.")
|
|
|
91 |
|
92 |
if all or dataset_args["calculation"] == "lengths":
|
93 |
print("\n* Calculating text lengths.")
|
|
|
|
|
94 |
dstats.load_or_prepare_text_lengths()
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
print("Done!")
|
96 |
|
97 |
if all or dataset_args["calculation"] == "labels":
|
98 |
if not dstats.label_field:
|
99 |
+
print("Warning: You asked for label calculation, but didn't "
|
100 |
+
"provide the labels field name. Assuming it is 'label'...")
|
101 |
dstats.set_label_field("label")
|
102 |
+
else:
|
103 |
print("\n* Calculating label distribution.")
|
104 |
dstats.load_or_prepare_labels()
|
105 |
fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
|
|
|
116 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
117 |
dstats, use_cache=use_cache
|
118 |
)
|
119 |
+
do_npmi(npmi_stats)
|
120 |
print("Done!")
|
121 |
print(
|
122 |
"nPMI results now available in %s for all identity terms that "
|
|
|
147 |
dstats.load_or_prepare_embeddings()
|
148 |
|
149 |
|
150 |
+
def do_npmi(npmi_stats):
|
151 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
152 |
completed_pairs = {}
|
153 |
print("Iterating through terms for joint npmi.")
|
|
|
170 |
label_field,
|
171 |
calculation,
|
172 |
out_dir,
|
|
|
173 |
use_cache=True,
|
174 |
):
|
175 |
if not use_cache:
|
|
|
194 |
"calculation": calculation,
|
195 |
"cache_dir": out_dir,
|
196 |
}
|
197 |
+
load_or_prepare(dataset_args, use_cache=use_cache)
|
198 |
|
199 |
|
200 |
def main():
|
|
|
276 |
args = parser.parse_args()
|
277 |
print("Proceeding with the following arguments:")
|
278 |
print(args)
|
279 |
+
# run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
|
280 |
+
get_text_label_df(args.dataset, args.config, args.split, args.feature,
|
281 |
+
args.label_field, args.calculation, args.out_dir,
|
282 |
+
use_cache=args.cached)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
print()
|
284 |
|
285 |
|