|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import soundfile |
|
import json |
|
import numpy as np |
|
import audb |
|
from pathlib import Path |
|
|
|
# Emotion-dimension names (A/D/V). Not referenced in this chunk; presumably
# used by importers of this module — TODO confirm before removing.
LABELS = ['arousal', 'dominance', 'valence']
|
|
|
|
|
|
|
|
|
def load_speech(split=None):
    """Return wav file paths of natural-speech prompts loaded via audb.

    Each database listed in ``DB`` is downloaded (or served from the local
    cache at ``/cache/audb/``), resampled to 16 kHz mono wav, and the index
    of the selected table is collected into one flat list.

    Parameters
    ----------
    split : unused
        Accepted for backward compatibility with existing callers; the
        current implementation ignores it.

    Returns
    -------
    list
        File paths (table index entries) from every loaded table.
    """
    # (database name, version, table, index-has-timedeltas?)
    DB = [
        ['crema-d', '1.1.1', 'emotion.voice.test', False],
    ]

    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        a = audb.load(database_name,
                      sampling_rate=16000,
                      format='wav',
                      mixdown=True,
                      version=ver,
                      cache_root='/cache/audb/')
        a = a[table].get()
        if has_timedeltas:
            # Segmented tables (file, start, end) are not supported here:
            # they are logged and skipped rather than added to the output.
            print(f'{has_timedeltas=}')
        else:
            # Table index entries are plain file paths for these tables.
            output_list.extend(a.index)
    return output_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the lists of synthetic style-prompt wavs from the mimic3 voice ids
# declared in voices.json (schema: {"voices": {key: {"voice": id, ...}}}).
with open('voices.json', 'r') as f:
    df = json.load(f)['voices']
voice_names = [v['voice'] for v in df.values()]


def _voice_wav_name(voice):
    """Map a mimic3 voice id (e.g. 'en_US/vctk_low#p236') to its wav filename."""
    return voice.replace('/', '_').replace('#', '_').replace(
        'cmu-arctic', 'cmu_arctic').replace('_low', '') + '.wav'


synthetic_wav_paths = []          # plain mimic3 style prompts
synthetic_wav_paths_AFFECT = []   # affect/speed variant prompts (v2 renders)
for voice in voice_names:
    wav_name = _voice_wav_name(voice)
    synthetic_wav_paths.append('assets/wavs/style_vector/' + wav_name)
    synthetic_wav_paths_AFFECT.append('assets/wavs/style_vector_v2/' + wav_name)

print(len(synthetic_wav_paths))

# Natural (human) speech prompts pulled from audb.
natural_wav_paths = load_speech()
|
|
|
|
|
|
|
import msinference |
|
|
|
|
|
# Harvard sentences: a list of groups, each {'sentences': [...ten strings...]}.
with Path('harvard.json').open('r') as handle:
    harvard_individual_sentences = json.load(handle)['sentences']
|
|
|
|
|
|
|
|
|
|
|
# Synthesize every Harvard sentence once per style-prompt source and write
# one concatenated 24 kHz wav per source.
for audio_prompt in ['mimic3', 'mimic3_speed', 'human']:
    total_audio = []
    ix = 0  # global sentence counter; cycles through the prompt wav lists
    for list_of_10 in harvard_individual_sentences:
        for text in list_of_10['sentences']:
            # Pick the style prompt for this sentence, wrapping around the
            # available wavs (was a hard-coded `% 134`; use the real length
            # so a changed voices.json cannot silently mis-index).
            if audio_prompt == 'mimic3':
                style_vec = msinference.compute_style(
                    synthetic_wav_paths[ix % len(synthetic_wav_paths)])
            elif audio_prompt == 'mimic3_speed':
                style_vec = msinference.compute_style(
                    synthetic_wav_paths_AFFECT[ix % len(synthetic_wav_paths_AFFECT)])
            elif audio_prompt == 'human':
                style_vec = msinference.compute_style(
                    natural_wav_paths[ix % len(natural_wav_paths)])
            else:
                # Previously only printed a typo'd message and then crashed
                # on the undefined style_vec below; fail loudly instead.
                raise ValueError(f'unknown style prompt source: {audio_prompt}')
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)

            total_audio.append(x)

        print('_____________________')

    total_audio = np.concatenate(total_audio)
    # Print the filename actually written (the old message said
    # '<prompt>_full_770.wav' while writing '<prompt>_770.wav').
    out_file = f'{audio_prompt}_770.wav'
    soundfile.write(out_file, total_audio, 24000)
    print(out_file)
|
|