# Synthesize all Harvard Lists (767 sentences) as a single .wav per prompt
# source. Style prompts are taken from:
#
#   1. './prompt_mimic3_english/'
#   2. './prompt_mimic3_english_4x/'
#   3. './prompt_human/'
#   4. './prompt_mimic3_foreign/'
#   5. './prompt_mimic3_foreign_4x/'
#
# ----> The resulting .wavs will be used for visualisation.
import json
import os

import numpy as np
import soundfile

import audb
import msinference  # StyleTTS2 wrapper used below: compute_style() / inference()
LABELS = ['arousal', 'dominance', 'valence']  # A/D/V dimensions (unused in this script)
def load_human_speech(split=None):
    """Return .wav file paths of human speech (emodb) to use as style prompts."""
    DB = [
        # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
        # ['crema-d', '1.1.1', 'emotion.voice.test', False],
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]
    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        a = audb.load(database_name,
                      sampling_rate=16000,
                      format='wav',
                      mixdown=True,
                      version=ver,
                      cache_root='/cache/audb/')
        a = a[table].get()
        if has_timedeltas:
            print(f'{has_timedeltas=}')
            # a = a.reset_index()[['file', 'start', 'end']]
            # output_list += [[*t] for t
            #                 in zip(a.file.values,
            #                        a.start.dt.total_seconds().values,
            #                        a.end.dt.total_seconds().values)]
        else:
            output_list += [f for f in a.index]  # full files (no timedeltas)
    return output_list
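

# Usage sketch (assumes an audb cache is reachable at '/cache/audb/'; the
# exact path layout depends on the audb version and cache settings):
# paths = load_human_speech()
# print(len(paths), paths[0])  # absolute .wav paths, usable by compute_style()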
# SYNTHESIZE: one long .wav per prompt source
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']
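
# Expected layout of harvard.json, inferred from the loop below (each entry
# groups one Harvard list of ten sentences):
# {"sentences": [{"sentences": ["The birch canoe slid on the smooth planks.",
#                               ...]},
#                ...]}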
for audio_prompt in [  # 'mimic3_english',
                       # 'mimic3_english_4x',
                     'human',
                     'mimic3_foreign',
                     'mimic3_foreign_4x']:
    if audio_prompt == 'human':
        prompt_paths = load_human_speech()  # emodb as human prompts (better choice?)
    else:
        prompt_dir = '/data/dkounadis/artificial-styletts2/' + audio_prompt + '/'
        prompt_paths = [prompt_dir + f for f in os.listdir(prompt_dir)]
    prompt_paths = prompt_paths[:10]  # cap at 10 prompts; cycled below
    print(prompt_paths, '\n\n__________')
    total_audio = []
    ix = 0
    # NOTE: [:1] keeps only the first Harvard list (10 sentences) for a quick
    # run; drop the slice to synthesize all lists.
    for list_of_10 in harvard_individual_sentences[:1]:
        # long_sentence = ' '.join(list_of_10['sentences'])
        # harvard.append(long_sentence.replace('.', ' '))
        for text in list_of_10['sentences']:
            # cycle through the prompt .wavs so consecutive sentences
            # receive different styles
            style_vec = msinference.compute_style(prompt_paths[ix % len(prompt_paths)])
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
    total_audio = np.concatenate(total_audio)  # concat all synthesized sentences
    soundfile.write(f'{audio_prompt}_767_5.wav', total_audio, 24000)
    print(f'{audio_prompt}_767_5.wav')
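
# Optional sanity check (sketch): inspect the last file written above.
# soundfile.info() is part of the soundfile API; 24000 Hz is the rate passed
# to soundfile.write() in the loop.
info = soundfile.info(f'{audio_prompt}_767_5.wav')
print(f'{info.duration:.1f} s @ {info.samplerate} Hz')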