Spaces:
Sleeping
Sleeping
import os | |
import json | |
from datasets import load_dataset, get_dataset_config_names, Features, Value | |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split | |
from utils import total_audio_duration_per_split, average_audio_duration_per_split, average_utterance_length_chars_per_split, average_utterance_length_words_per_split, speakers_per_split, meta_cov_per_split | |
#, uniq_utts_per_speaker | |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split | |
import argparse | |
# Root output directories for generated plots and JSON reports.
# TODO: move to a shared constants module.
output_dir_plots = "./plots"
output_dir_reports = "./reports"

# Create both roots up front. The original created the plots directory twice
# and never created the reports directory.
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)
# Command-line interface:
#   --dataset                name of the dataset to generate reports for (required)
#   --no_secret_test_split   opt out of reading the "<dataset>-secret" repository
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# store_true: the attribute is False by default and becomes True only when the
# flag is given, so `not args.no_secret_test_split` means "use the secret
# repository". The previous store_false action inverted this: the secret
# repository was read only when the "no secret" flag was passed.
parser.add_argument('--no_secret_test_split', action='store_true', help="Do not retrieve references for the test split from the secret distribution")
args = parser.parse_args()

dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))

if not args.no_secret_test_split:
    # The secret repository name is the dataset name with a "-secret" suffix.
    dataset_name_secret = "-".join([dataset_name, "secret"])
    print(dataset_name_secret)
    # Check whether the secret repository is reachable. A failure is only
    # reported here; loading data from it later may still fail.
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception as err:
        print("Config for secret dataset {} cannot be retrieved! ({})".format(dataset_name_secret, err))
# Reports for this dataset are written to <output_dir_reports>/<dataset_name>.
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)

# Names of all configs (subsets) available for the dataset.
dataset_configs = get_dataset_config_names(dataset_name)

# Result containers, keyed by config name, and the JSON files they are saved to.
dataset_statistics = {}
dataset_contents = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")

# Schema passed to load_dataset. Only metadata columns are listed; the intent
# is to skip loading of the audio data.
features_to_load = Features(
    {
        'audioname': Value(dtype='string'),
        'split': Value(dtype='string'),
        'dataset': Value(dtype='string'),
        'speaker_id': Value(dtype='string'),
        'ref_orig': Value(dtype='string'),
        'audio_duration_samples': Value(dtype='int32'),
        'audio_duration_seconds': Value(dtype='float32'),
        'samplingrate_orig': Value(dtype='int32'),
        'sampling_rate': Value(dtype='int32'),
        'audiopath_bigos': Value(dtype='string'),
        'audiopath_local': Value(dtype='string'),
        'speaker_age': Value(dtype='string'),
        'speaker_gender': Value(dtype='string'),
    }
)
# For every dataset config: load the metadata, compute the statistics and
# content listings, and render the audio-duration violin plots.
for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))

    # Per-config result dicts, aliased locally to keep the lines below short.
    stats = dataset_statistics[config_name] = {}
    contents = dataset_contents[config_name] = {}

    dataset_hf_subset = load_dataset(
        dataset_name, config_name, features=features_to_load, trust_remote_code=True
    )
    # Reference-based helpers receive None when the secret repository is not used.
    dataset_hf_subset_secret = (
        None
        if args.no_secret_test_split
        else load_dataset(
            dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True
        )
    )

    # audio content size
    stats["samples"] = num_of_samples_per_split(dataset_hf_subset)
    stats["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
    stats["speakers"] = speakers_per_split(dataset_hf_subset)

    # text content size
    # metrics based on transcriptions (references) - requires reading secret repo for test split
    stats["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)

    # text content derived features; each helper returns a (statistics, contents) pair
    stats["utts_unique"], contents["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["words_unique"], contents["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["chars_unique"], contents["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["samples_per_spk_stats"], contents["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)

    # audio content derived features
    stats["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    stats["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)

    # metadata coverage per split - speaker gender and age
    stats["meta_cov_gender"] = meta_cov_per_split(dataset_hf_subset, 'speaker_gender')
    stats["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')

    # metadata distribution (text form) - speaker gender and age
    stats["meta_dist_gender"] = meta_distribution_text(dataset_hf_subset, 'speaker_gender')
    stats["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')

    # TODO: unique utterances per speaker (uniq_utts_per_speaker)
    # TODO: number of words per speaker (min, max, med, avg, std)

    # distribution of audio duration, grouped by speaker gender and then by speaker age
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    for speaker_field in ('speaker_gender', 'speaker_age'):
        meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', speaker_field)
# Persist the results as JSON: the statistics dict first, then the content analysis.
for report_path, report_payload in (
    (output_fn_stats, dataset_statistics),
    (output_fn_contents, dataset_contents),
):
    with open(report_path, 'w') as report_file:
        json.dump(report_payload, report_file)