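# Generates per-config statistics and content reports for a Hugging Face
# speech dataset: sample/word/character counts, speaker stats, speech rate,
# metadata coverage, and violin plots of audio duration by speaker gender/age.
# Reports are written as JSON under ./reports/<dataset>, plots under ./plots/.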
import os
import json
import argparse

from datasets import load_dataset, get_dataset_config_names, Features, Value
from utils import (
    num_of_samples_per_split, uniq_utts_per_split, words_per_split,
    uniq_words_per_split, chars_per_split, uniq_chars_per_split,
    total_audio_duration_per_split, average_audio_duration_per_split,
    average_utterance_length_chars_per_split, average_utterance_length_words_per_split,
    speakers_per_split, meta_cov_per_split,
    meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker,
    speech_rate_words_per_split, speech_rate_chars_per_split,
    # uniq_utts_per_speaker,  # currently unused (see commented-out call below)
)
# TODO: move to constants
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)
# read the dataset name and secret-split behaviour from the command line
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# store_false: defaults to True (test references ARE read from the secret repo);
# pass --no_secret_test_split to skip the secret repo
parser.add_argument('--no_secret_test_split', dest='use_secret_test_split', action='store_false',
                    help="Do not retrieve references for the test split from the secret distribution")
args = parser.parse_args()
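# example invocation (names are placeholders):
#   python <this_script>.py --dataset <hf_org>/<dataset_name>
#   python <this_script>.py --dataset <hf_org>/<dataset_name> --no_secret_test_split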
dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))
if args.use_secret_test_split:
    # the secret repo is expected under the name "<dataset>-secret"
    dataset_name_secret = "-".join([dataset_name, "secret"])
    print(dataset_name_secret)
    # check that the secret repo exists before trying to load from it
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception as e:
        raise SystemExit("Config for secret dataset {} cannot be retrieved: {}".format(dataset_name_secret, e))
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)
# get dataset config names
dataset_configs = get_dataset_config_names(dataset_name)
# initialize output structures
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")
dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
# specify the features to load; the audio arrays are skipped,
# so only the metadata columns are downloaded
features_to_load = Features({
    'audioname': Value(dtype='string', id=None),
    'split': Value(dtype='string', id=None),
    'dataset': Value(dtype='string', id=None),
    'speaker_id': Value(dtype='string', id=None),
    'ref_orig': Value(dtype='string', id=None),
    'audio_duration_samples': Value(dtype='int32', id=None),
    'audio_duration_seconds': Value(dtype='float32', id=None),
    'samplingrate_orig': Value(dtype='int32', id=None),
    'sampling_rate': Value(dtype='int32', id=None),
    'audiopath_bigos': Value(dtype='string', id=None),
    'audiopath_local': Value(dtype='string', id=None),
    'speaker_age': Value(dtype='string', id=None),
    'speaker_gender': Value(dtype='string', id=None),
})
for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))
    dataset_statistics[config_name] = {}
    dataset_contents[config_name] = {}
    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    if args.use_secret_test_split:
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
    else:
        dataset_hf_subset_secret = None
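    # when loaded, the secret subset supplies the reference transcriptions for
    # the test split; the text metrics below take it as an optional second argument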
    # audio content size
    dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
    # text content size
    # metrics based on transcriptions (references) - requires reading the secret repo for the test split
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    # text content derived features
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
    # audio content derived features
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
    # metadata coverage per subset in percent - speaker gender and age
    dataset_statistics[config_name]["meta_cov_gender"] = meta_cov_per_split(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
    # metadata value distributions per subset
    dataset_statistics[config_name]["meta_dist_gender"] = meta_distribution_text(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
    # TODO: dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
    # TODO: number of words per speaker (min, max, med, avg, std)
    # distribution of audio duration per subset: by gender, then by age
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_gender')
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')
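# illustrative shape of the statistics JSON (keys mirror the assignments above):
#   { "<config_name>": { "samples": ..., "audio[h]": ..., "speakers": ...,
#                        "words": ..., "chars": ..., "utts_unique": ..., ... } }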
# save dataset statistics dict to storage as a JSON file
with open(output_fn_stats, 'w') as f:
    json.dump(dataset_statistics, f)
# save dataset content analysis to storage
with open(output_fn_contents, 'w') as f:
    json.dump(dataset_contents, f)