Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
96acdf5
1
Parent(s):
4f4c0c4
Adding docstrings to run_data_measurements CLI
Browse files
- run_data_measurements.py +22 -21
run_data_measurements.py
CHANGED
@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
|
|
12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
13 |
"""
|
14 |
Loader specifically for the widgets used in the app.
|
|
|
15 |
Args:
|
16 |
-
ds_args:
|
17 |
-
show_embeddings:
|
18 |
-
use_cache:
|
19 |
|
20 |
Returns:
|
21 |
-
|
22 |
"""
|
23 |
|
24 |
if not isdir(ds_args["cache_dir"]):
|
@@ -58,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
|
58 |
dstats.load_or_prepare_zipf()
|
59 |
|
60 |
|
61 |
-
def load_or_prepare(dataset_args,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
all = False
|
63 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
64 |
print("Loading dataset.")
|
@@ -86,8 +96,8 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
86 |
|
87 |
if all or dataset_args["calculation"] == "labels":
|
88 |
if not dstats.label_field:
|
89 |
-
print("Warning: You asked for label calculation, but didn't
|
90 |
-
"the labels field name. Assuming it is 'label'...")
|
91 |
dstats.set_label_field("label")
|
92 |
else:
|
93 |
print("\n* Calculating label distribution.")
|
@@ -106,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
106 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
107 |
dstats, use_cache=use_cache
|
108 |
)
|
109 |
-
do_npmi(npmi_stats
|
110 |
print("Done!")
|
111 |
print(
|
112 |
"nPMI results now available in %s for all identity terms that "
|
@@ -137,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
137 |
dstats.load_or_prepare_embeddings()
|
138 |
|
139 |
|
140 |
-
def do_npmi(npmi_stats
|
141 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
142 |
completed_pairs = {}
|
143 |
print("Iterating through terms for joint npmi.")
|
@@ -160,7 +170,6 @@ def get_text_label_df(
|
|
160 |
label_field,
|
161 |
calculation,
|
162 |
out_dir,
|
163 |
-
do_html=False,
|
164 |
use_cache=True,
|
165 |
):
|
166 |
if not use_cache:
|
@@ -268,17 +277,9 @@ def main():
|
|
268 |
print("Proceeding with the following arguments:")
|
269 |
print(args)
|
270 |
# run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
|
271 |
-
get_text_label_df(
|
272 |
-
|
273 |
-
|
274 |
-
args.split,
|
275 |
-
args.feature,
|
276 |
-
args.label_field,
|
277 |
-
args.calculation,
|
278 |
-
args.out_dir,
|
279 |
-
do_html=args.do_html,
|
280 |
-
use_cache=args.cached,
|
281 |
-
)
|
282 |
print()
|
283 |
|
284 |
|
|
|
12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
13 |
"""
|
14 |
Loader specifically for the widgets used in the app.
|
15 |
+
Does not take specifications from user.
|
16 |
Args:
|
17 |
+
ds_args: Dataset configuration settings (config name, split, etc)
|
18 |
+
show_embeddings: Whether to compute embeddings (slow)
|
19 |
+
use_cache: Whether to grab files that have already been computed
|
20 |
|
21 |
Returns:
|
22 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
23 |
"""
|
24 |
|
25 |
if not isdir(ds_args["cache_dir"]):
|
|
|
59 |
dstats.load_or_prepare_zipf()
|
60 |
|
61 |
|
62 |
+
def load_or_prepare(dataset_args, use_cache=False):
|
63 |
+
"""
|
64 |
+
Users can specify which aspects of the dataset they would like to compute.
|
65 |
+
Args:
|
66 |
+
dataset_args: Dataset configuration settings (config name, split, etc)
|
67 |
+
use_cache: Whether to grab files that have already been computed
|
68 |
+
|
69 |
+
Returns:
|
70 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
71 |
+
"""
|
72 |
all = False
|
73 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
74 |
print("Loading dataset.")
|
|
|
96 |
|
97 |
if all or dataset_args["calculation"] == "labels":
|
98 |
if not dstats.label_field:
|
99 |
+
print("Warning: You asked for label calculation, but didn't "
|
100 |
+
"provide the labels field name. Assuming it is 'label'...")
|
101 |
dstats.set_label_field("label")
|
102 |
else:
|
103 |
print("\n* Calculating label distribution.")
|
|
|
116 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
117 |
dstats, use_cache=use_cache
|
118 |
)
|
119 |
+
do_npmi(npmi_stats)
|
120 |
print("Done!")
|
121 |
print(
|
122 |
"nPMI results now available in %s for all identity terms that "
|
|
|
147 |
dstats.load_or_prepare_embeddings()
|
148 |
|
149 |
|
150 |
+
def do_npmi(npmi_stats):
|
151 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
152 |
completed_pairs = {}
|
153 |
print("Iterating through terms for joint npmi.")
|
|
|
170 |
label_field,
|
171 |
calculation,
|
172 |
out_dir,
|
|
|
173 |
use_cache=True,
|
174 |
):
|
175 |
if not use_cache:
|
|
|
277 |
print("Proceeding with the following arguments:")
|
278 |
print(args)
|
279 |
# run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
|
280 |
+
get_text_label_df(args.dataset, args.config, args.split, args.feature,
|
281 |
+
args.label_field, args.calculation, args.out_dir,
|
282 |
+
use_cache=args.cached)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
print()
|
284 |
|
285 |
|