|
|
|
|
|
|
|
|
|
|
|
import json |
|
from tqdm import tqdm |
|
import os |
|
import librosa |
|
|
|
from utils.util import has_existed |
|
|
|
|
|
def get_lines(file):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # so non-ASCII content reads the same on every system.
    with open(file, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
|
|
|
|
|
def _get_audio_duration(audio_file):
    """Return the duration of ``audio_file`` in seconds, as reported by librosa."""
    try:
        # librosa 0.10 renamed the keyword to ``path``; ``filename`` is
        # deprecated there and warns on every call.
        return librosa.get_duration(path=audio_file)
    except TypeError:
        # Older librosa releases only accept the ``filename`` keyword.
        return librosa.get_duration(filename=audio_file)


def get_uid2utt(opencpop_path, dataset, dataset_type):
    """Build the utterance metadata list for one split of the dataset.

    Reads ``<opencpop_path>/segments/<dataset_type>.txt``, takes the text
    before the first ``|`` of each non-blank line as the utterance id, and
    measures the duration of ``<opencpop_path>/segments/wavs/<uid>.wav``.

    Args:
        opencpop_path: Root directory of the dataset.
        dataset: Dataset name, stored verbatim in every record.
        dataset_type: Split name; used as the base name of the segment file.

    Returns:
        A tuple ``(uid2utt, hours)``. ``uid2utt`` is a list of dicts with the
        keys ``Dataset``, ``index``, ``Singer``, ``Uid``, ``Path`` and
        ``Duration`` (seconds); ``hours`` is the summed duration divided
        by 3600.
    """
    segment_file = os.path.join(
        opencpop_path, "segments", "{}.txt".format(dataset_type)
    )
    # Skip blank lines: they would yield an empty uid and a lookup of ".wav".
    entries = [line for line in get_lines(segment_file) if line]

    uid2utt = []
    total_duration = 0
    for index, line in enumerate(tqdm(entries)):
        uid = line.split("|")[0]
        audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
        duration = _get_audio_duration(audio_file)

        uid2utt.append(
            {
                "Dataset": dataset,
                "index": index,
                "Singer": "female1",
                "Uid": uid,
                "Path": audio_file,
                "Duration": duration,
            }
        )
        total_duration += duration

    # Durations are accumulated in seconds; report the total in hours.
    return uid2utt, total_duration / 3600
|
|
|
|
|
def main(dataset, output_path, dataset_path):
    """Write ``train.json`` and ``test.json`` metadata files for the dataset.

    For each split, the utterance list from ``get_uid2utt`` is dumped as JSON
    to ``<output_path>/<dataset>/<split>.json`` and the split's total duration
    in hours is printed. A split is skipped when ``has_existed`` returns a
    truthy value for its output file (presumably meaning the file is already
    present; confirm against ``utils.util``).

    Args:
        dataset: Dataset name; used as the output sub-directory and stored in
            every record.
        output_path: Directory under which the ``<dataset>`` folder is created.
        dataset_path: Root directory of the source dataset.
    """
    print("-" * 10)
    print("Dataset splits for {}...\n".format(dataset))

    save_dir = os.path.join(output_path, dataset)
    opencpop_path = dataset_path
    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        res, hours = get_uid2utt(opencpop_path, dataset, dataset_type)

        os.makedirs(save_dir, exist_ok=True)
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 rather than with the platform's locale.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)

        print("{}_{}_hours= {}".format(dataset, dataset_type, hours))
|
|