import json
import os
from pathlib import Path

import audb
import numpy as np
import soundfile

LABELS = ['arousal', 'dominance', 'valence']
|

def load_human_speech(split=None):
    """Return paths of human speech files collected from audb databases."""
    DB = [
        # [database_name, version, table, has_timedeltas]
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]

    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        a = audb.load(database_name,
                      sampling_rate=16000,
                      format='wav',
                      mixdown=True,
                      version=ver,
                      cache_root='/cache/audb/')
        a = a[table].get()
        if has_timedeltas:
            # Segmented tables are not handled here; only report them.
            print(f'{has_timedeltas=}')
        else:
            # Filewise tables index the audio files directly by path.
            output_list += [f for f in a.index]
    return output_list
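

# Hedged sketch, not in the original script: if a DB entry had
# has_timedeltas=True, its table index would hold (file, start, end) tuples
# with pandas Timedeltas, and each segment could be cropped to its own clip
# roughly like this (helper name and output directory are made up here):
def crop_segments(index, out_dir='/tmp/segments/'):
    os.makedirs(out_dir, exist_ok=True)
    paths = []
    for i, (file, start, end) in enumerate(index):
        sr = soundfile.info(file).samplerate
        signal, _ = soundfile.read(file,
                                   start=int(start.total_seconds() * sr),
                                   stop=int(end.total_seconds() * sr))
        out = os.path.join(out_dir, f'seg_{i}.wav')
        soundfile.write(out, signal, sr)
        paths.append(out)
    return paths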


import msinference
|
# harvard.json groups the Harvard sentences into lists of ten.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']
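
# Assumed layout of harvard.json, inferred from the nested loop below (each
# entry's inner 'sentences' key holds one list of ten Harvard sentences):
#   {"sentences": [{"sentences": ["First sentence.", "...", ...]}, ...]}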
|
for audio_prompt in [
        'human',
        'mimic3_foreign',
        'mimic3_foreign_4x']:

    if audio_prompt == 'human':
        prompt_paths = load_human_speech()
    else:
        prompt_dir = '/data/dkounadis/artificial-styletts2/' + audio_prompt + '/'
        # Note: os.listdir() order is arbitrary; sort it if a reproducible
        # prompt order matters across runs.
        prompt_paths = [prompt_dir + f for f in os.listdir(prompt_dir)]
    # Use at most 10 prompts; they are cycled through below.
    prompt_paths = prompt_paths[:10]
    print(prompt_paths, '\n\n__________')

    total_audio = []
    ix = 0
    # Only the first list of ten Harvard sentences is synthesized here.
    for list_of_10 in harvard_individual_sentences[:1]:
        for text in list_of_10['sentences']:
            # Rotate through the prompt files for the style reference.
            style_vec = msinference.compute_style(prompt_paths[ix % len(prompt_paths)])
            print(ix, text)
            ix += 1
            # alpha/beta blend the reference style with the text-predicted
            # style (timbre vs. prosody weighting in StyleTTS2).
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
    # Concatenate all sentences into a single 24 kHz wav per prompt condition.
    total_audio = np.concatenate(total_audio)
    soundfile.write(f'{audio_prompt}_767_5.wav', total_audio, 24000)
    print(f'{audio_prompt}_767_5.wav')
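
    # Not in the original script: a quick sanity check that the file was
    # written with the expected sample rate and a nonzero duration.
    info = soundfile.info(f'{audio_prompt}_767_5.wav')
    print(f'duration={info.duration:.1f}s sr={info.samplerate}')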