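"""Generate statistics reports and plots for a Hugging Face speech dataset.

For every configuration of the dataset passed via --dataset, this script
computes audio- and text-based statistics (sample counts, durations,
word/character counts, speaker and metadata coverage) and stores them as
JSON reports, alongside violin plots of audio duration per speaker metadata.
"""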
import os
import json
import argparse

from datasets import load_dataset, get_dataset_config_names, Features, Value
from utils import (
    num_of_samples_per_split, uniq_utts_per_split, words_per_split,
    uniq_words_per_split, chars_per_split, uniq_chars_per_split,
    total_audio_duration_per_split, average_audio_duration_per_split,
    average_utterance_length_chars_per_split, average_utterance_length_words_per_split,
    speakers_per_split, meta_cov_per_split,
    meta_distribution_text, meta_distribution_violin_plot,
    recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split,
)
# from utils import uniq_utts_per_speaker  # currently unused (see TODO below)

# TODO: move output dirs to a constants module
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)

# read configuration from the command line
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# store_true, so the flag defaults to False: test split references are read
# from the secret distribution unless --no_secret_test_split is passed
parser.add_argument("--no_secret_test_split", action="store_true", help="Do not retrieve references for the test split from the secret distribution")

args = parser.parse_args()
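# Example invocation (script and dataset names below are illustrative):
#   python generate_reports.py --dataset my-org/my-asr-dataset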


dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))

if not args.no_secret_test_split:
    # references for the test split live in a companion "<dataset>-secret" repo
    dataset_name_secret = "-".join([dataset_name, "secret"])

    # fail fast if the secret repo does not exist or cannot be read
    print("Using secret test split repo: {}".format(dataset_name_secret))
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception as e:
        raise RuntimeError("Configs for secret dataset {} cannot be retrieved!".format(dataset_name_secret)) from e

output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)

# get dataset config names
dataset_configs = get_dataset_config_names(dataset_name)

# initialize output structures
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")

dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")

# specify the features (columns) to load; omitting the audio column skips
# loading of the raw audio data, which keeps the pass over the dataset cheap
features_to_load = Features({
    'audioname': Value(dtype='string', id=None),
    'split': Value(dtype='string', id=None),
    'dataset': Value(dtype='string', id=None),
    'speaker_id': Value(dtype='string', id=None),
    'ref_orig': Value(dtype='string', id=None),
    'audio_duration_samples': Value(dtype='int32', id=None),
    'audio_duration_seconds': Value(dtype='float32', id=None),
    'samplingrate_orig': Value(dtype='int32', id=None),
    'sampling_rate': Value(dtype='int32', id=None),
    'audiopath_bigos': Value(dtype='string', id=None),
    'audiopath_local': Value(dtype='string', id=None),
    'speaker_age': Value(dtype='string', id=None),
    'speaker_gender': Value(dtype='string', id=None),
})
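
# Quick sanity check (hypothetical; assumes the config defines a "train" split):
#   ds = load_dataset(dataset_name, dataset_configs[0], features=features_to_load, trust_remote_code=True)
#   print(ds["train"][0]["ref_orig"], ds["train"][0]["audio_duration_seconds"])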

for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))
    
    dataset_statistics[config_name] = {}
    dataset_contents[config_name] = {}

    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    
    if not args.no_secret_test_split:
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
    else:
        dataset_hf_subset_secret = None

    # audio content size
    dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)

    # text content size
    # metrics based on transcriptions (references) - requires reading secret repo for test split
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)

    # text content derived features
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    
    dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"]  = recordings_per_speaker(dataset_hf_subset)

    # audio content derived features
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
    
    # metadata coverage per subset in percent - speaker gender and age
    dataset_statistics[config_name]["meta_cov_gender"] = meta_cov_per_split(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')

    # distribution of metadata values (gender, age) per subset
    dataset_statistics[config_name]["meta_dist_gender"] = meta_distribution_text(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')

    # TODO: unique utterances per speaker, e.g.:
    # dataset_statistics[config_name]["uniq_utts_per_speaker"] = uniq_utts_per_speaker(dataset_hf_subset)
    # TODO: number of words per speaker (min, max, med, avg, std)

    # distribution of audio duration per gender
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    # make sure the per-config plot directory exists before writing figures
    os.makedirs(output_dir_plots_subset, exist_ok=True)
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_gender')

    # distribution of audio duration per age
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')

    
# save dataset statistics dict to storage as a JSON file
# indent/ensure_ascii make the reports human-readable (incl. non-ASCII text)
with open(output_fn_stats, 'w') as f:
    json.dump(dataset_statistics, f, indent=2, ensure_ascii=False)

# save dataset content analysis to storage
with open(output_fn_contents, 'w') as f:
    json.dump(dataset_contents, f, indent=2, ensure_ascii=False)
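
# To inspect the generated reports later (hypothetical quick check):
#   with open(output_fn_stats) as f:
#       stats = json.load(f)
#   print(list(stats.keys()))  # one key per dataset config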