Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
•
25f0e74
1
Parent(s):
5d90238
Added average audio utterance calculation
Browse files- app.py +3 -3
- reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json +1 -1
- reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json +2 -2
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json +1 -1
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json +2 -2
- run-analysis.py +12 -5
- utils.py +26 -2
app.py
CHANGED
@@ -64,7 +64,7 @@ with analysis_bigos:
|
|
64 |
st.dataframe(df_sum_stats_text)
|
65 |
|
66 |
|
67 |
-
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
|
68 |
|
69 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
70 |
|
@@ -131,12 +131,12 @@ with analysis_bigos_pelcra:
|
|
131 |
st.dataframe(df_sum_stats_text)
|
132 |
|
133 |
|
134 |
-
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
|
135 |
|
136 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
137 |
|
138 |
st.subheader("Dataset features (text)")
|
139 |
-
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:
|
140 |
st.dataframe(df_sum_stats_feats_text)
|
141 |
|
142 |
st.subheader("Dataset features (audio)")
|
|
|
64 |
st.dataframe(df_sum_stats_text)
|
65 |
|
66 |
|
67 |
+
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
|
68 |
|
69 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
70 |
|
|
|
131 |
st.dataframe(df_sum_stats_text)
|
132 |
|
133 |
|
134 |
+
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
|
135 |
|
136 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
137 |
|
138 |
st.subheader("Dataset features (text)")
|
139 |
+
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:3]]
|
140 |
st.dataframe(df_sum_stats_feats_text)
|
141 |
|
142 |
st.subheader("Dataset features (audio)")
|
reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 46668863
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:43e808b081d9b692c2469396565fb967105fd815894a7eaded34e89969dbc890
|
3 |
size 46668863
|
reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0acb30a9a215f9c96b567b8753f565f400eac2366df6dba6248ccba859e190e3
|
3 |
+
size 23940
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 95274266
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9cea38447dc7485c0f628eba6e52f45e24d1d467fbe23c065162d6b36455ab1d
|
3 |
size 95274266
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ab97523e5f4776bb566ed57c38126004bfac43f64bb3177e9ae39f1ee6e51d5
|
3 |
+
size 30399
|
run-analysis.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import json
|
3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
5 |
-
from utils import
|
6 |
#, uniq_utts_per_speaker
|
7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
8 |
import argparse
|
@@ -58,19 +58,26 @@ for config_name in dataset_configs:
|
|
58 |
if(args.secret_test_split):
|
59 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
60 |
|
|
|
61 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
62 |
-
dataset_statistics[config_name]["audio[h]"] =
|
63 |
dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
|
64 |
|
|
|
65 |
# metrics based on transcriptions (references) - requires reading secret repo for test split
|
66 |
-
dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
67 |
dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
68 |
-
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
69 |
dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
|
|
|
|
|
|
70 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
|
|
71 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
72 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
73 |
-
|
|
|
74 |
# metadata coverage per subset in percent - speaker accent
|
75 |
dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
|
76 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
|
|
2 |
import json
|
3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
5 |
+
from utils import total_audio_duration_per_split, average_audio_duration_per_split, speakers_per_split, meta_cov_per_split
|
6 |
#, uniq_utts_per_speaker
|
7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
8 |
import argparse
|
|
|
58 |
if(args.secret_test_split):
|
59 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
60 |
|
61 |
+
#audio content size
|
62 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
63 |
+
dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
|
64 |
dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
|
65 |
|
66 |
+
# text content size
|
67 |
# metrics based on transcriptions (references) - requires reading secret repo for test split
|
|
|
68 |
dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
69 |
dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
70 |
+
|
71 |
+
# text content derived features
|
72 |
+
dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
73 |
+
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
74 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
75 |
+
|
76 |
+
# audio content derived features
|
77 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
78 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
79 |
+
dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
|
80 |
+
|
81 |
# metadata coverage per subset in percent - speaker accent
|
82 |
dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
|
83 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
utils.py
CHANGED
@@ -32,7 +32,7 @@ def num_of_samples_per_split(dataset_hf):
|
|
32 |
|
33 |
return out_dict
|
34 |
|
35 |
-
def
|
36 |
# input - huggingface dataset object
|
37 |
# output - dictionary with statistics about audio duration per split
|
38 |
out_dict = {}
|
@@ -52,6 +52,31 @@ def audio_duration_per_split(dataset_hf):
|
|
52 |
out_dict["all_splits"] = sum(out_dict.values())
|
53 |
return out_dict
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
def speakers_per_split(dataset_hf):
|
56 |
# input - huggingface dataset object
|
57 |
# output - dictionary with statistics about audio duration per split
|
@@ -350,7 +375,6 @@ def meta_distribution_text(dataset_hf, meta_field):
|
|
350 |
return out_dict
|
351 |
|
352 |
|
353 |
-
|
354 |
def recordings_per_speaker(dataset_hf):
|
355 |
recordings_per_speaker_stats_dict = {}
|
356 |
|
|
|
32 |
|
33 |
return out_dict
|
34 |
|
35 |
+
def total_audio_duration_per_split(dataset_hf):
|
36 |
# input - huggingface dataset object
|
37 |
# output - dictionary with statistics about audio duration per split
|
38 |
out_dict = {}
|
|
|
52 |
out_dict["all_splits"] = sum(out_dict.values())
|
53 |
return out_dict
|
54 |
|
55 |
+
|
56 |
+
def average_audio_duration_per_split(dataset_hf):
    """Compute the average audio utterance duration (in seconds) per split.

    Input: huggingface dataset object (dict-like: split name -> columns),
    where each split exposes an "audio_duration_seconds" column.
    Output: dictionary mapping each split name to its average utterance
    duration rounded to 2 decimals, plus an "all_splits" key with the
    average over every sample in the dataset.
    """
    out_dict = {}
    metric = "average_audio_duration[s]"
    print("Calculating {}".format(metric))
    samples_all = 0
    audio_length_total_seconds = 0
    for split in dataset_hf.keys():
        durations = dataset_hf[split]["audio_duration_seconds"]
        audio_length_split_seconds = sum(durations)
        audio_length_total_seconds += audio_length_split_seconds

        samples_split = len(durations)
        samples_all += samples_split
        # Guard against an empty split - report 0.0 instead of dividing by zero.
        if samples_split == 0:
            out_dict[split] = 0.0
        else:
            out_dict[split] = round(audio_length_split_seconds / samples_split, 2)
    # overall average across every sample in all splits (0.0 if dataset is empty)
    if samples_all == 0:
        out_dict["all_splits"] = 0.0
    else:
        out_dict["all_splits"] = round(audio_length_total_seconds / samples_all, 2)
    return out_dict
|
79 |
+
|
80 |
def speakers_per_split(dataset_hf):
|
81 |
# input - huggingface dataset object
|
82 |
# output - dictionary with statistics about audio duration per split
|
|
|
375 |
return out_dict
|
376 |
|
377 |
|
|
|
378 |
def recordings_per_speaker(dataset_hf):
|
379 |
recordings_per_speaker_stats_dict = {}
|
380 |
|