# NOTE: web-page residue ("Spaces: Running") preceded this file; it is not part of the source.
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np | |
import os | |
import tgt | |
def get_alignment(tier, cfg):
    """Collect phone labels and per-phone frame counts from an alignment tier.

    Silence intervals ("sil", "sp", "spn") at the beginning and at the end of
    the tier are dropped; silences that sit between ordinary phones are kept.

    Args:
        tier: Object whose ``_objects`` attribute is an iterable of intervals,
            each exposing ``start_time``, ``end_time`` and ``text``.
        cfg: Config supporting item access for ``"sample_rate"`` and
            ``"hop_size"``.

    Returns:
        A tuple ``(phones, durations, start_time, end_time)``:
        ``phones`` is the list of kept labels, ``durations`` the matching list
        of ints (rounded end frame minus rounded start frame), ``start_time``
        the start of the first non-silent interval and ``end_time`` the end of
        the last non-silent interval. Times are in the tier's own units
        (presumably seconds, since they are scaled by ``sample_rate``). When
        the tier has no non-silent interval the result is ``([], [], 0, 0)``.
    """
    sample_rate = cfg["sample_rate"]
    hop_size = cfg["hop_size"]
    sil_phones = ("sil", "sp", "spn")

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    # Number of entries to keep; everything after the last ordinary phone
    # is trailing silence.
    end_idx = 0

    for interval in tier._objects:
        s, e, p = interval.start_time, interval.end_time, interval.text
        is_silence = p in sil_phones

        # Trim leading silences: nothing is recorded until the first
        # ordinary phone, whose start becomes the utterance start.
        if not phones:
            if is_silence:
                continue
            start_time = s

        phones.append(p)
        if not is_silence:
            end_time = e
            end_idx = len(phones)

        # Round both boundaries to frame indices before subtracting so that
        # consecutive durations add up without accumulating rounding drift.
        durations.append(
            int(
                np.round(e * sample_rate / hop_size)
                - np.round(s * sample_rate / hop_size)
            )
        )

    # Trim trailing silences.
    return phones[:end_idx], durations[:end_idx], start_time, end_time
def get_duration(utt, wav, cfg):
    """Load an utterance's TextGrid and return its phone durations and span.

    Args:
        utt: Mapping with the keys ``"Singer"``, ``"Uid"`` and ``"Dataset"``,
            used to locate the utterance's files under ``cfg.processed_dir``.
        wav: Not used by this function; kept for interface compatibility.
        cfg: Config that supports both item access (``cfg["sample_rate"]``)
            and attribute access (``cfg.processed_dir``).

    Returns:
        ``None`` when the aligned start is not before the aligned end (for
        example when no non-silent phone was found). Otherwise a tuple
        ``(duration, text, start, end)`` where ``duration`` is the list of
        per-phone frame counts, ``text`` is the phones joined by spaces and
        wrapped in braces, and ``start`` / ``end`` are the alignment
        boundaries multiplied by the sample rate and truncated to int.
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    text_path = os.path.join(
        cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
    )
    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # NOTE(review): the raw transcript line is not used below. The read is
    # kept so that a missing or unreadable .lab file still raises here, as it
    # did before.
    with open(text_path, "r") as f:
        f.readline()

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )
    text = "{" + " ".join(phone) + "}"
    if start >= end:
        return None

    return duration, text, int(sample_rate * start), int(sample_rate * end)