Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
•
c3c241a
1
Parent(s):
df7c120
Updated dashboard and util scripts
Browse files
- app.py +1 -1
- run-analysis.py +4 -4
- utils.py +8 -8
app.py
CHANGED
@@ -24,7 +24,7 @@ metrics_size = metrics_size_audio + metrics_size_text
|
|
24 |
metrics_features_text_uniq = ["utts_unique", "words_unique", "chars_unique"]
|
25 |
metrics_features_speech_rate = ["words_per_sec", "chars_per_sec"]
|
26 |
metrics_features_duration = ["average_audio_duration[s]", "average_utterance_length[words]", "average_utterance_length[chars]"]
|
27 |
-
metrics_features_meta = ["
|
28 |
metrics_features = metrics_features_text_uniq + metrics_features_speech_rate + metrics_features_duration + metrics_features_meta
|
29 |
|
30 |
|
|
|
24 |
metrics_features_text_uniq = ["utts_unique", "words_unique", "chars_unique"]
|
25 |
metrics_features_speech_rate = ["words_per_sec", "chars_per_sec"]
|
26 |
metrics_features_duration = ["average_audio_duration[s]", "average_utterance_length[words]", "average_utterance_length[chars]"]
|
27 |
+
metrics_features_meta = ["meta_cov_gender", "meta_cov_age"]
|
28 |
metrics_features = metrics_features_text_uniq + metrics_features_speech_rate + metrics_features_duration + metrics_features_meta
|
29 |
|
30 |
|
run-analysis.py
CHANGED
@@ -48,7 +48,7 @@ dataset_contents = {}
|
|
48 |
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
|
49 |
|
50 |
# specify features to load. Skip loading of audio data
|
51 |
-
features_to_load = Features({'audioname': Value(dtype='string', id=None), 'split': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'speaker_id': Value(dtype='string', id=None), 'ref_orig': Value(dtype='string', id=None), 'audio_duration_samples': Value(dtype='int32', id=None), 'audio_duration_seconds': Value(dtype='float32', id=None), 'samplingrate_orig': Value(dtype='int32', id=None), 'sampling_rate': Value(dtype='int32', id=None), 'audiopath_bigos': Value(dtype='string', id=None), 'audiopath_local': Value(dtype='string', id=None), 'speaker_age': Value(dtype='string', id=None), '
|
52 |
|
53 |
for config_name in dataset_configs:
|
54 |
print("Generating stats for {}".format(config_name))
|
@@ -88,11 +88,11 @@ for config_name in dataset_configs:
|
|
88 |
dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
|
89 |
|
90 |
# metadata coverage per subset in percent - speaker accent
|
91 |
-
dataset_statistics[config_name]["
|
92 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
93 |
|
94 |
# speech rate per subset
|
95 |
-
dataset_statistics[config_name]["
|
96 |
dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
|
97 |
|
98 |
# dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
|
@@ -100,7 +100,7 @@ for config_name in dataset_configs:
|
|
100 |
|
101 |
# distribution of audio duration per subset
|
102 |
output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
|
103 |
-
meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', '
|
104 |
|
105 |
# distribution of audio duration per age
|
106 |
meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')
|
|
|
48 |
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
|
49 |
|
50 |
# specify features to load. Skip loading of audio data
|
51 |
+
features_to_load = Features({'audioname': Value(dtype='string', id=None), 'split': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'speaker_id': Value(dtype='string', id=None), 'ref_orig': Value(dtype='string', id=None), 'audio_duration_samples': Value(dtype='int32', id=None), 'audio_duration_seconds': Value(dtype='float32', id=None), 'samplingrate_orig': Value(dtype='int32', id=None), 'sampling_rate': Value(dtype='int32', id=None), 'audiopath_bigos': Value(dtype='string', id=None), 'audiopath_local': Value(dtype='string', id=None), 'speaker_age': Value(dtype='string', id=None), 'speaker_gender': Value(dtype='string', id=None)})
|
52 |
|
53 |
for config_name in dataset_configs:
|
54 |
print("Generating stats for {}".format(config_name))
|
|
|
88 |
dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
|
89 |
|
90 |
# metadata coverage per subset in percent - speaker accent
|
91 |
+
dataset_statistics[config_name]["meta_cov_gender"] = meta_cov_per_split(dataset_hf_subset, 'speaker_gender')
|
92 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
93 |
|
94 |
# speech rate per subset
|
95 |
+
dataset_statistics[config_name]["meta_dist_gender"] = meta_distribution_text(dataset_hf_subset, 'speaker_gender')
|
96 |
dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
|
97 |
|
98 |
# dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
|
|
|
100 |
|
101 |
# distribution of audio duration per subset
|
102 |
output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
|
103 |
+
meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_gender')
|
104 |
|
105 |
# distribution of audio duration per age
|
106 |
meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')
|
utils.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
|
|
8 |
|
9 |
# move to consts
|
10 |
buckets_age=['teens','twenties', 'thirties', 'fourties', 'fifties', 'sixties', 'seventies', 'eighties', 'nineties']
|
11 |
-
|
12 |
|
13 |
def load_bigos_analyzer_report(fp:str)->dict:
|
14 |
with open(fp, 'r') as f:
|
@@ -288,8 +288,8 @@ def meta_cov_per_split(dataset_hf, meta_field):
|
|
288 |
# TODO move to config
|
289 |
if meta_field == 'speaker_age':
|
290 |
buckets = buckets_age
|
291 |
-
if meta_field == '
|
292 |
-
buckets =
|
293 |
out_dict = {}
|
294 |
metric = "meta_cov_" + meta_field
|
295 |
print("Calculating {}".format(metric))
|
@@ -378,8 +378,8 @@ def meta_distribution_text(dataset_hf, meta_field):
|
|
378 |
no_meta=False
|
379 |
if meta_field == 'speaker_age':
|
380 |
buckets = buckets_age
|
381 |
-
if meta_field == '
|
382 |
-
buckets =
|
383 |
|
384 |
# input - huggingface dataset object
|
385 |
# output - dictionary with statistics about audio duration per split
|
@@ -494,12 +494,12 @@ def recordings_per_speaker(dataset_hf):
|
|
494 |
return out_dict_stats, out_dict_contents
|
495 |
|
496 |
|
497 |
-
def meta_distribution_bar_plot(dataset_hf, output_dir, dimension = "
|
498 |
pass
|
499 |
|
500 |
-
def meta_distribution_violin_plot(dataset_hf, output_dir, metric = "audio_duration_seconds", dimension = "
|
501 |
# input - huggingface dataset object
|
502 |
-
# output - figure with distribution of audio duration per
|
503 |
out_dict = {}
|
504 |
|
505 |
print("Generating violin plat for metric {} for dimension {}".format(metric, dimension))
|
|
|
8 |
|
9 |
# move to consts
|
10 |
buckets_age=['teens','twenties', 'thirties', 'fourties', 'fifties', 'sixties', 'seventies', 'eighties', 'nineties']
|
11 |
+
buckets_gender=["male", "female"]
|
12 |
|
13 |
def load_bigos_analyzer_report(fp:str)->dict:
|
14 |
with open(fp, 'r') as f:
|
|
|
288 |
# TODO move to config
|
289 |
if meta_field == 'speaker_age':
|
290 |
buckets = buckets_age
|
291 |
+
if meta_field == 'speaker_gender':
|
292 |
+
buckets = buckets_gender
|
293 |
out_dict = {}
|
294 |
metric = "meta_cov_" + meta_field
|
295 |
print("Calculating {}".format(metric))
|
|
|
378 |
no_meta=False
|
379 |
if meta_field == 'speaker_age':
|
380 |
buckets = buckets_age
|
381 |
+
if meta_field == 'speaker_gender':
|
382 |
+
buckets = buckets_gender
|
383 |
|
384 |
# input - huggingface dataset object
|
385 |
# output - dictionary with statistics about audio duration per split
|
|
|
494 |
return out_dict_stats, out_dict_contents
|
495 |
|
496 |
|
497 |
+
def meta_distribution_bar_plot(dataset_hf, output_dir, dimension = "speaker_gender"):
|
498 |
pass
|
499 |
|
500 |
+
def meta_distribution_violin_plot(dataset_hf, output_dir, metric = "audio_duration_seconds", dimension = "speaker_gender"):
|
501 |
# input - huggingface dataset object
|
502 |
+
# output - figure with distribution of audio duration per gender
|
503 |
out_dict = {}
|
504 |
|
505 |
print("Generating violin plat for metric {} for dimension {}".format(metric, dimension))
|