import os
import json
import argparse

from datasets import load_dataset, get_dataset_config_names, Features, Value

from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
from utils import audio_duration_per_split, speakers_per_split, meta_cov_per_split
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
# uniq_utts_per_speaker is not imported yet - see the commented-out call in the loop below
# TODO: move these paths to a shared constants module
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
os.makedirs(output_dir_reports, exist_ok=True)
# read configuration from command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for, e.g. amu-cai/pl-asr-bigos-v2")
# NOTE: argparse's type=bool treats any non-empty string as True, so a proper
# boolean flag (--secret_test_split / --no-secret_test_split) is used instead
parser.add_argument("--secret_test_split", default=True, action=argparse.BooleanOptionalAction, help="Should references for the test split be retrieved from the secret distribution?")
args = parser.parse_args()
dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))
if args.secret_test_split:
    dataset_name_secret = "-".join([dataset_name, "secret"])
    print("Secret repository with test split references: {}".format(dataset_name_secret))
    # check that the secret repo exists; if it cannot be reached, fall back to public splits only
    try:
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception:
        print("Config for secret dataset {} cannot be retrieved! Falling back to public splits only.".format(dataset_name_secret))
        args.secret_test_split = False
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)
# get dataset config names
dataset_configs = get_dataset_config_names(dataset_name)
# initialize output structures
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")
dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")
# specify the features to load explicitly, so that the audio data itself is skipped
features_to_load = Features({
    'audioname': Value(dtype='string'),
    'split': Value(dtype='string'),
    'dataset': Value(dtype='string'),
    'speaker_id': Value(dtype='string'),
    'ref_orig': Value(dtype='string'),
    'audio_duration_samples': Value(dtype='int32'),
    'audio_duration_seconds': Value(dtype='float32'),
    'samplingrate_orig': Value(dtype='int32'),
    'sampling_rate': Value(dtype='int32'),
    'audiopath_bigos': Value(dtype='string'),
    'audiopath_local': Value(dtype='string'),
    'speaker_age': Value(dtype='string'),
    'speaker_sex': Value(dtype='string'),
})
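# note (assumption): passing a Features spec without the audio column relies on the
# dataset loading script to skip decoding audio; if the builder still returns it,
# the column can be dropped explicitly with dataset.remove_columns("audio")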
for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))
    dataset_statistics[config_name] = {}
    dataset_contents[config_name] = {}
    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    if args.secret_test_split:
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
    else:
        # assumption: the per-split utils functions fall back to public references when given None
        dataset_hf_subset_secret = None
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
dataset_statistics[config_name]["audio[h]"] = audio_duration_per_split(dataset_hf_subset)
dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
    # metrics based on transcriptions (references) - requires reading the secret repo for the test split
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    # speech rate per subset
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    # metadata coverage per subset in percent - speaker sex and age
    dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
    # metadata value distributions per subset
    dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
    dataset_statistics[config_name]["samples_per_spk"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
    # dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
    # TODO: number of words per speaker (min, max, med, avg, std) - see the sketch below the loop
    # distribution of audio duration per sex and per age
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_sex')
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')
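
# Sketch for the words-per-speaker TODO above: min/max/median/avg/std of word
# counts per speaker for a single split. Illustrative draft only - it assumes
# each example exposes the 'speaker_id' and 'ref_orig' fields declared in the
# features spec, and it is not wired into the reports yet.
def words_per_speaker_sketch(dataset_split):
    import statistics
    words_per_spk = {}
    for example in dataset_split:
        n_words = len(example["ref_orig"].split())
        words_per_spk[example["speaker_id"]] = words_per_spk.get(example["speaker_id"], 0) + n_words
    counts = list(words_per_spk.values())
    if not counts:
        return {}
    return {
        "min": min(counts),
        "max": max(counts),
        "med": statistics.median(counts),
        "avg": statistics.mean(counts),
        "std": statistics.stdev(counts) if len(counts) > 1 else 0.0,
    }
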
# save the dataset statistics dict to storage as a JSON file
with open(output_fn_stats, 'w') as f:
    json.dump(dataset_statistics, f)
# save the dataset content analysis to storage
with open(output_fn_contents, 'w') as f:
    json.dump(dataset_contents, f)
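
# Example invocation (the script filename is hypothetical - adjust to the actual file):
#   python generate_reports.py --dataset amu-cai/pl-asr-bigos-v2
#   python generate_reports.py --dataset amu-cai/pl-asr-bigos-v2 --no-secret_test_split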