# Synthesize all Harvard Lists - 767 sentences as a single .wav
#
# 1.
#
# './prompt_mimic3_english/'
#
# 2.
#
# './prompt_mimic3_english_4x/'
#
# 3.
#
# './prompt_human/'
#
# 4.
#
# './prompt_mimic3_foreign/'
#
# 5.
#
# './prompt_mimic3_foreign_4x/'
#
#
# ----> The .wavs will be used for visualisation
import json
import os

import audb
import numpy as np
import soundfile

LABELS = ['arousal', 'dominance', 'valence']


def load_human_speech():
    '''Load paths of full human-speech .wav files from audb to use as style prompts.'''
    DB = [
        # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
        # ['crema-d', '1.1.1', 'emotion.voice.test', False],
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]
    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        a = audb.load(database_name,
                      sampling_rate=16000,
                      format='wav',
                      mixdown=True,
                      version=ver,
                      cache_root='/cache/audb/')
        a = a[table].get()
        if has_timedeltas:
            print(f'{has_timedeltas=}')
            # a = a.reset_index()[['file', 'start', 'end']]
            # output_list += [[*t] for t in zip(a.file.values,
            #                                   a.start.dt.total_seconds().values,
            #                                   a.end.dt.total_seconds().values)]
        else:
            output_list += [f for f in a.index]  # use full files (no timedeltas)
    return output_list


# SYNTHESIZE mimic3 / mimic3_4x / human (emodb)
import msinference

with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

for audio_prompt in [  # 'mimic3_english',
                       # 'mimic3_english_4x',
                     'human',
                     'mimic3_foreign',
                     'mimic3_foreign_4x']:
    if audio_prompt == 'human':
        prompt_paths = load_human_speech()  # emodb full files as prompts
    else:
        prompt_dir = '/data/dkounadis/artificial-styletts2/' + audio_prompt + '/'
        prompt_paths = [prompt_dir + f for f in os.listdir(prompt_dir)]
    prompt_paths = prompt_paths[:10]  # cycle through the first 10 prompts
    print(prompt_paths, '\n\n__________')

    total_audio = []
    ix = 0
    for list_of_10 in harvard_individual_sentences[:1]:  # [:1] restricts to the 1st list for testing
        # long_sentence = ' '.join(list_of_10['sentences'])
        # harvard.append(long_sentence.replace('.', ' '))
        for text in list_of_10['sentences']:
            # Rotate through the style prompts, one per sentence
            style_vec = msinference.compute_style(prompt_paths[ix % len(prompt_paths)])
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
    total_audio = np.concatenate(total_audio)  # concat all lists (77 when not sliced)
    soundfile.write(f'{audio_prompt}_767_5.wav', total_audio, 24000)
    print(f'{audio_prompt}_767_5.wav')
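
# ------------------------------------------------------------------
# Optional sanity check (a minimal sketch, not part of the original
# pipeline): read each synthesized .wav back with soundfile and print
# its duration, to confirm the concatenation produced one continuous
# file at 24 kHz. The filename pattern mirrors the soundfile.write()
# call above; only files that were actually written are inspected.
for audio_prompt in ['human', 'mimic3_foreign', 'mimic3_foreign_4x']:
    out_file = f'{audio_prompt}_767_5.wav'
    if os.path.exists(out_file):
        wav, fs = soundfile.read(out_file)
        print(f'{out_file}: {len(wav) / fs:.1f}s @ {fs}Hz')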