# EmotionalIntensityControl / data_preparation.py
# Prepares filelists, Kaldi-format mel features, speaker/emotion label maps,
# and cleaned transcripts for the emotional TTS dataset.
import kaldiio
import os
import librosa
from tqdm import tqdm
import glob
import json
from shutil import copyfile
import pandas as pd
import argparse
from text import _clean_text, symbols
from num2words import num2words
import re
from melspec import mel_spectrogram
import torchaudio
def _write_utt_lists(dataset_path, spks, filelists_path):
    """Write train_utts.txt / eval_utts.txt: one wav basename (utterance id)
    per line, gathered from each speaker's train/ and eval/ subfolders.

    NOTE(review): assumes dataset_path ends with a path separator — TODO confirm.
    """
    train_files, eval_files = [], []
    for spk in spks:
        train_files += glob.glob(dataset_path + spk + "/train/*.wav")
        eval_files += glob.glob(dataset_path + spk + "/eval/*.wav")
    for list_name, wav_files in (('train_utts.txt', train_files),
                                 ('eval_utts.txt', eval_files)):
        with open(filelists_path + list_name, 'w', encoding='utf-8') as f:
            for wav_path in wav_files:
                f.write(os.path.splitext(os.path.basename(wav_path))[0] + '\n')


def _extract_features(dataset_path, feats_ark_file, feats_scp_file):
    """Compute an 80-bin mel spectrogram for every .wav under dataset_path and
    store it in Kaldi ark/scp format, keyed by the wav basename.

    Fix: the original also opened feats.scp with mode 'w' beforehand, but that
    handle was never used — kaldiio.WriteHelper creates and writes both the
    ark and the scp itself, so the extra open only truncated the same file.
    """
    with kaldiio.WriteHelper(f'ark,scp:{feats_ark_file},{feats_scp_file}') as writer:
        for root, _dirs, files in os.walk(dataset_path):
            for file in tqdm(files):
                if not file.endswith('.wav'):
                    continue
                wav_path = os.path.join(root, file)
                wav_name = os.path.splitext(os.path.basename(wav_path))[0]
                signal, _rate = torchaudio.load(wav_path)
                # 1024-pt FFT/window, 80 mel bins, 22.05 kHz, hop 256, fmin 0, fmax 8 kHz
                spec = mel_spectrogram(signal, 1024, 80, 22050, 256,
                                       1024, 0, 8000, center=False).squeeze()
                writer[wav_name] = spec


def _write_label_maps(dataset_path, filelists_path):
    """Build utt -> speaker-id and utt -> emotion-id maps and dump them as
    utt2spk.json / utt2emo.json (labels start at 0, stored as strings).

    Emotion ids are the sorted-alphabetical index of the emotion token, which
    is the second '_'-separated field of each file's basename.
    """
    emotions = sorted({os.path.basename(x).split("_")[1]
                       for x in glob.glob(dataset_path + '/**/**/*')})
    utt2spk, utt2emo = {}, {}
    for wav_path in tqdm(glob.glob(dataset_path + '**/**/*.wav')):
        wav_name = os.path.splitext(os.path.basename(wav_path))[0]
        emotion = emotions.index(wav_name.split("_")[1])
        # The speaker id is the directory three levels up the POSIX path.
        spk_dir = wav_path.split('/')[-3]
        if spk_dir == '1263201035':
            spk = 0  # labels should start with 0
        elif spk_dir == '805570882':
            spk = 1
        else:
            spk = 2
        utt2spk[wav_name] = str(spk)
        utt2emo[wav_name] = str(emotion)
    with open(filelists_path + 'utt2emo.json', 'w') as fp:
        json.dump(dict(sorted(utt2emo.items())), fp, indent=4)
    with open(filelists_path + 'utt2spk.json', 'w') as fp:
        json.dump(dict(sorted(utt2spk.items())), fp, indent=4)


def _write_text(dataset_path, filelists_path):
    """Clean every transcript and write a Kaldi-style 'text' file:
    '<utt-id> <cleaned transcript>' per line, sorted by utterance id.

    Cleaning: project kazakh_cleaners, apostrophes dropped, digits spelled
    out in Kazakh via num2words, then any character outside the model's
    symbol set removed.

    Fix: the original wrapped the read loop in an unused open(...,'w') of the
    same output file, truncating it twice; it also collected a `vocab` set
    that was never used. Both removed — the written content is unchanged.
    """
    utt2text = {}
    for txt_path in sorted(glob.glob(dataset_path + '/**/**/*.txt')):
        basename = os.path.basename(txt_path).replace('.txt', '')
        with open(txt_path, 'r', encoding='utf-8') as f:
            cleaned = _clean_text(f.read().strip("\n"),
                                  cleaner_names=["kazakh_cleaners"]).replace("'", "")
        cleaned = re.sub(r'(\d+)', lambda m: num2words(m.group(), lang='kz'), cleaned)
        utt2text[basename] = ''.join(s for s in cleaned if s in symbols)
    with open(filelists_path + '/text', 'w', encoding='utf-8') as f:
        for utt, text in sorted(utt2text.items()):
            f.write(utt + ' ' + text + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', type=str, required=True,
                        help='path to the emotional dataset')
    args = parser.parse_args()
    dataset_path = args.data
    filelists_path = 'filelists/all_spks/'
    os.makedirs(filelists_path, exist_ok=True)

    # Hard-coded speaker directories; any other speaker folder maps to id 2.
    spks = ['1263201035', '805570882', '399172782']

    _write_utt_lists(dataset_path, spks, filelists_path)
    _extract_features(dataset_path,
                      filelists_path + 'feats.ark',
                      filelists_path + 'feats.scp')
    _write_label_maps(dataset_path, filelists_path)
    _write_text(dataset_path, filelists_path)