# artificial-styletts2 / tts_harvard.py
# Synthesize all Harvard Lists - 767 sentences - as a single .wav per prompt
# source:
#
#   1. './prompt_mimic3_english/'
#   2. './prompt_mimic3_english_4x/'
#   3. './prompt_human/'
#   4. './prompt_mimic3_foreign/'
#   5. './prompt_mimic3_foreign_4x/'
#
# ----> The .wavs will be used for visualisation.
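#
# Note: '3.' (human) does not read a prompt directory: its prompts come from
# the emodb corpus via audb (see load_human_speech() below). The other
# entries are wav folders under /data/dkounadis/artificial-styletts2/.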
import soundfile
import json
import numpy as np
import audb
from pathlib import Path
import os
LABELS = ['arousal', 'dominance', 'valence']
def load_human_speech(split=None):
    DB = [
        # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
        # ['crema-d', '1.1.1', 'emotion.voice.test', False],
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]
    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        a = audb.load(database_name,
                      sampling_rate=16000,
                      format='wav',
                      mixdown=True,
                      version=ver,
                      cache_root='/cache/audb/')
        a = a[table].get()
        if has_timedeltas:
            # Segmented tables are not handled yet: compute_style() below
            # expects plain file paths, not (file, start, end) triplets.
            print(f'{has_timedeltas=}')
            # a = a.reset_index()[['file', 'start', 'end']]
            # output_list += [[*t] for t
            #                 in zip(a.file.values,
            #                        a.start.dt.total_seconds().values,
            #                        a.end.dt.total_seconds().values)]
        else:
            output_list += [f for f in a.index]  # full wav files (no timedeltas)
    return output_list
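# Example use (a minimal check, assuming the audb cache at /cache/audb/ is
# writable; otherwise audb downloads emodb on first call):
#
#   files = load_human_speech()
#   print(len(files), files[0])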
# SYNTHESIZE: one long .wav per prompt source (human / mimic3_foreign / mimic3_foreign_4x)
import msinference
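# msinference (local module) is assumed to expose compute_style(), mapping a
# reference wav to a style vector, and inference(), returning a 24 kHz
# waveform; both are used in the loop below.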
prompt_paths = {}
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']
for audio_prompt in [  # 'mimic3_english',
                       # 'mimic3_english_4x',
                     'human',
                     'mimic3_foreign',
                     'mimic3_foreign_4x']:
    if audio_prompt == 'human':
        prompt_paths = load_human_speech()  # emodb full files via audb
    else:
        prompt_dir = '/data/dkounadis/artificial-styletts2/' + audio_prompt + '/'
        prompt_paths = [prompt_dir + f for f in os.listdir(prompt_dir)]
    prompt_paths = prompt_paths[:10]  # keep only the first 10 prompts
    print(prompt_paths, '\n\n__________')
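    # A minimal sanity check (an addition, assuming both the os.listdir-derived
    # and the audb-index entries are absolute file paths): warn about missing
    # prompt files before the slow synthesis loop starts.
    missing = [p for p in prompt_paths if not os.path.isfile(p)]
    if missing:
        print('missing prompt files:', missing)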
    total_audio = []
    ix = 0
    # NOTE: [:1] keeps only the first Harvard list (10 sentences); drop the
    # slice to synthesize all 77 lists / 767 sentences.
    for list_of_10 in harvard_individual_sentences[:1]:
        # long_sentence = ' '.join(list_of_10['sentences'])
        # harvard.append(long_sentence.replace('.', ' '))
        for text in list_of_10['sentences']:
            # Cycle round-robin through the (up to) 10 prompts.
            style_vec = msinference.compute_style(prompt_paths[ix % len(prompt_paths)])
            print(ix, text)
            ix += 1
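            # Mixing coefficients: per the StyleTTS2 reference implementation
            # (which msinference is assumed to wrap), alpha/beta blend the
            # prompt style with the text-predicted style (alpha ~ timbre,
            # beta ~ prosody; higher = more text-driven), and diffusion_steps
            # trades quality for speed.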
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
    total_audio = np.concatenate(total_audio)  # concat all synthesized sentences
    soundfile.write(f'{audio_prompt}_767_5.wav', total_audio, 24000)
    print(f'{audio_prompt}_767_5.wav')
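    # Quick verification (an addition, not part of the original pipeline):
    # report the duration of the file just written.
    wav, sr = soundfile.read(f'{audio_prompt}_767_5.wav')
    print(f'{len(wav) / sr:.1f}s @ {sr} Hz')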