artificial-styletts2 / tts_harvard.py
Dionyssos's picture
mimic3 prompt scripts
9b9c715
raw
history blame
5.2 kB
# Synthesize all Harvard lists (77 lists of 10 sentences each) into a single .wav per style prompt ----- NEEDS TO BE RUN from within https://github.com/audeering/shift/
#
# 1. using mimic3 style
# Folder: 'prompt_mimic3/'
# 2. using mimic3 4x accelerated style
# Folder: 'prompt_mimic3speed/'
# 3. using crema-d style
# Folder: 'prompt_human/'
#
# WAVS used from tts_paper_plot.py
import soundfile
import json
import numpy as np
import audb
from pathlib import Path
# Names of three label dimensions (arousal, dominance, valence).
# NOTE(review): not referenced anywhere in the visible part of this script.
LABELS = ['arousal', 'dominance', 'valence']
def load_speech(split=None):
    """Gather the table index entries of every database enabled in ``DB``.

    Each enabled database is fetched with ``audb.load`` (sampling_rate=16000,
    format='wav', mixdown=True, cache under '/cache/audb/'), the named table
    is read with ``.get()``, and its index entries are appended to the result.

    Args:
        split: Accepted for the caller's convenience; not used here.

    Returns:
        list: Index entries of all tables whose last ``DB`` field is False,
        in the order the databases are listed. Tables flagged True are only
        reported on stdout and contribute nothing.
    """
    DB = [
        # [dataset, version, table, has_timdeltas_or_is_full_wavfile]
        ['crema-d', '1.1.1', 'emotion.voice.test', False],
        # ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
        # ['entertain-playtestcloud', '1.1.0', 'emotion.categories.train.gold_standard', True],
        # ['erik', '2.2.0', 'emotion.categories.train.gold_standard', True],
        # ['meld', '1.3.1', 'emotion.categories.train.gold_standard', False],
        # ['msppodcast', '5.0.0', 'emotion.categories.train.gold_standard', False],  # standalone bucket because it has gt labels?
        # ['myai', '1.0.1', 'emotion.categories.train.gold_standard', False],
        # ['casia', None, 'emotion.categories.gold_standard', False],
        # ['switchboard-1', None, 'sentiment', True],
        # ['swiss-parliament', None, 'segments', True],
        # ['argentinian-parliament', None, 'segments', True],
        # ['austrian-parliament', None, 'segments', True],
        # #'german', --> bundestag
        # ['brazilian-parliament', None, 'segments', True],
        # ['mexican-parliament', None, 'segments', True],
        # ['portuguese-parliament', None, 'segments', True],
        # ['spanish-parliament', None, 'segments', True],
        # ['chinese-vocal-emotions-liu-pell', None, 'emotion.categories.desired', False],
        # peoples-speech slow
        # ['peoples-speech', None, 'train-initial', False]
    ]
    output_list = []
    # NOTE: the loop variable must stay named `has_timedeltas`; the f-string
    # below prints the variable name itself.
    for database_name, ver, table, has_timedeltas in DB:
        database = audb.load(
            database_name,
            sampling_rate=16000,
            format='wav',
            mixdown=True,
            version=ver,
            cache_root='/cache/audb/',
        )
        table_frame = database[table].get()
        if not has_timedeltas:
            # use file (no timedeltas): every index entry is taken as-is
            output_list.extend(table_frame.index)
            continue
        # Tables with timedeltas are reported but not collected.
        print(f'{has_timedeltas=}')
        # table_frame = table_frame.reset_index()[['file', 'start', 'end']]
        # output_list += [[*t] for t
        #     in zip(table_frame.file.values,
        #            table_frame.start.dt.total_seconds().values,
        #            table_frame.end.dt.total_seconds().values)]
    return output_list
# Build the lists of wavs used as style prompts:
#   - one wav per voice listed in voices.json, under two asset folders
#   - the natural speech entries returned by load_speech()
with open('voices.json', 'r') as f:
    df = json.load(f)['voices']
voice_names = [entry['voice'] for entry in df.values()]

# Voice id -> wav file stem: '/' and '#' become '_', 'cmu-arctic' becomes
# 'cmu_arctic', and the '_low' marker is dropped.
wav_stems = [
    name.replace('/', '_')
        .replace('#', '_')
        .replace('cmu-arctic', 'cmu_arctic')
        .replace('_low', '')
    for name in voice_names
]
synthetic_wav_paths = [
    f'assets/wavs/style_vector/{stem}.wav' for stem in wav_stems
]
synthetic_wav_paths_AFFECT = [
    f'assets/wavs/style_vector_v2/{stem}.wav' for stem in wav_stems
]
print(len(synthetic_wav_paths))

natural_wav_paths = load_speech()
# SYNTHESIZE mimic mimicx4 crema-d
#
# For every style-prompt source, synthesize each Harvard sentence with a style
# vector computed from the next wav of that source (cycling when the source
# has fewer wavs than there are sentences), then write all utterances of that
# source into one concatenated wav named '<prompt>_770.wav'.
import msinference

with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

# Prompt name -> wavs that serve as style references for that prompt.
# A mapping replaces the former if/elif chain, so an unknown prompt name can
# no longer fall through with an undefined or stale style vector.
style_prompt_wavs = {
    'mimic3': synthetic_wav_paths,
    'mimic3_speed': synthetic_wav_paths_AFFECT,
    'human': natural_wav_paths,
}

for audio_prompt, style_wavs in style_prompt_wavs.items():
    if not style_wavs:
        # Fail with a clear message instead of a bare modulo/index error.
        raise ValueError(
            f'no style wavs available for prompt {audio_prompt!r}')

    total_audio = []
    ix = 0
    for list_of_10 in harvard_individual_sentences:
        # long_sentence = ' '.join(list_of_10['sentences'])
        # harvard.append(long_sentence.replace('.', ' '))
        for text in list_of_10['sentences']:
            # Cycle over however many style wavs exist; the modulus used to
            # be hard-coded to 134 for the synthetic lists.
            style_vec = msinference.compute_style(
                style_wavs[ix % len(style_wavs)])

            print(ix, text)
            ix += 1

            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)

        # -- separator after each list of 10 sentences
        print('_____________________')

    # -- concatenate all lists of this prompt before writing
    total_audio = np.concatenate(total_audio)
    out_path = f'{audio_prompt}_770.wav'
    # 24000 is the sample rate written to the file header
    # (presumably the output rate of msinference.inference -- TODO confirm).
    soundfile.write(out_path, total_audio, 24000)
    # Log the path that was actually written (the old message printed
    # '<prompt>_full_770.wav', which did not match the file on disk).
    print(out_path)