|
|
|
|
|
|
|
|
|
|
|
import os |
|
import glob |
|
import librosa |
|
import json |
|
|
|
from utils.util import has_existed |
|
|
|
|
|
def main(output_path, dataset_path):
    """Build the metadata files for the ``svcceval`` dataset.

    Scans ``<dataset_path>/Data/<singer>/<name>.wav`` and writes the
    following files into ``<output_path>/svcceval``:

    * ``utt2singer`` -- one ``"<uid>\\t<singer>"`` line per utterance.
    * ``singers.json`` -- mapping from singer name to an integer index,
      assigned in sorted-name order.
    * ``train.json`` / ``test.json`` -- lists of utterance records, sorted by
      ``Uid`` and annotated with a running ``index``. Each of these is
      skipped when ``has_existed(output_file)`` returns a truthy value.

    Every utterance found is put into the test split; the train split is
    always empty, so ``train.json`` is written as an empty list.

    Args:
        output_path: Root directory under which ``svcceval`` is created.
        dataset_path: Root directory of the raw dataset; must contain a
            ``Data`` sub-directory with one folder per singer.
    """
    print("-" * 10)
    print("Preparing training dataset for svcceval...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcceval")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")

    train = []
    test = []
    singers = set()

    # Sort so that the utt2singer line order does not depend on the order in
    # which the filesystem happens to list the files.
    wav_files = sorted(glob.glob(os.path.join(data_dir, "*/*.wav")))

    # The context manager guarantees the mapping file is flushed and closed
    # even if reading one of the audio files raises.
    with open(utt2singer_file, "w", encoding="utf-8") as utt2singer:
        for wav_file in wav_files:
            # Use os.path rather than splitting on "/" so this also works
            # with platform-specific path separators.
            singer = os.path.basename(os.path.dirname(wav_file))
            filename = os.path.basename(wav_file)
            # Everything before the first dot is the utterance id.
            uid = filename.split(".")[0]

            utt = {
                "Dataset": "svcceval",
                "Singer": singer,
                "Uid": "{}_{}".format(singer, uid),
                "Path": wav_file,
            }

            # NOTE(review): newer librosa releases appear to rename this
            # keyword to ``path`` -- confirm against the pinned version.
            utt["Duration"] = librosa.get_duration(filename=wav_file)

            test.append(utt)
            singers.add(singer)
            utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Singer indices are assigned in sorted-name order.
    singer_lut = {name: i for i, name in enumerate(sorted(singers))}
    with open(singer_dict_file, "w", encoding="utf-8") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum(utt["Duration"] for utt in train)
    test_total_duration = sum(utt["Duration"] for utt in test)

    # Explicit lookup table instead of eval() on the split name.
    splits = {"train": train, "test": test}
    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = sorted(splits[dataset_type], key=lambda x: x["Uid"])
        for i, utt in enumerate(utterances):
            utt["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )
|
|