import shutil
import csv
import io
import os
import typing
import wave
import sys
import time
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import transformers
import matplotlib.pyplot as plt

import audb
import audmodel
import audonnx
import audinterface
import audiofile

from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                 get_args,
                                 initialize_args,
                                 initialize_tts,
                                 shutdown_tts,
                                 OutputNaming,
                                 process_line)

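# NOTE: the StyleTTS2 synthesis below calls msinference.compute_style() and
# msinference.inference(); a local `msinference` module is assumed to be on the
# path and its import uncommented before running (see the reminder printed at
# the end of this script), e.g.:
# import msinference
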
# Enumerate all installed Mimic3 voices and split them into English vs.
# non-English; multi-speaker voices (those with a speakers.txt) expand to one
# entry per speaker, formatted as 'lang/voice#speaker'.
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
foreign_voices = []
english_voices = []

for lang in os.listdir(ROOT_DIR + 'voices'):
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        if 'en_' in lang:
            try:
                with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
                    for spk in f:
                        english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
            except FileNotFoundError:
                english_voices.append(lang + '/' + voice)
        else:
            try:
                with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
                    for spk in f:
                        foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
            except FileNotFoundError:
                foreign_voices.append(lang + '/' + voice)

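# Teacher output dimensions: 3 continuous dimensions (arousal/dominance/valence)
# followed by 8 categorical emotion classes.
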
LABELS = [
    'arousal', 'dominance', 'valence',
    'Angry',
    'Sad',
    'Happy',
    'Surprise',
    'Fear',
    'Disgust',
    'Contempt',
    'Neutral',
]

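# A bare Wav2Vec2Config is used here only as a container for the CUDA devices
# the teacher models run on.
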
config = transformers.Wav2Vec2Config()
config.dev = torch.device('cuda:0')
config.dev2 = torch.device('cuda:0')


def _softmax(x):
    '''x : (batch, num_class)'''
    x -= x.max(1, keepdims=True)
    x = np.maximum(-100, x)
    x = np.exp(x)
    x /= x.sum(1, keepdims=True)
    return x

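# Categorical speech-emotion teacher: a WavLM-based classifier whose forward()
# is replaced below by _infer(), which applies the model's attentive statistics
# pooling (attention-weighted mean and std of the SSL hidden states) before the
# SER head.
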
from transformers import AutoModelForAudioClassification
import types


def _infer(self, x):
    '''x: (batch, audio-samples-16KHz)'''
    x = (x + self.config.mean) / self.config.std
    x = self.ssl_model(x, attention_mask=None).last_hidden_state

    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = (x * w).sum(1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
        ], 1)
    return self.ser_model(x)


teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True
).to(config.dev2).eval()
teacher_cat.forward = types.MethodType(_infer, teacher_cat)

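# Arousal/dominance/valence teacher: an ONNX model fetched via audmodel and
# loaded with audonnx (model id kept verbatim from the original script).
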
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')


def process_function(x, sampling_rate, idx):
    '''Run both teachers on a chunk of 16 kHz audio.

    Returns one row of 11 values: [arousal, dominance, valence] from the ONNX
    adv_model, followed by the 8 categorical emotion probabilities from
    teacher_cat.
    '''
    logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()
    logits_adv = adv_model(x, 16000)['logits']

    cat = np.concatenate([logits_adv,
                          _softmax(logits_cat)],
                         1)
    print(cat)
    return cat

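# audinterface applies the 4 s / 1 s-hop sliding window itself
# (process_func_applies_sliding_window=False), resamples input to 16 kHz, and
# calls process_function once per window.
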
interface = audinterface.Feature(
    feature_names=LABELS,
    process_func=process_function,
    process_func_applies_sliding_window=False,
    win_dur=4.0,
    hop_dur=1.0,
    sampling_rate=16000,
    resample=True,
    verbose=True,
)

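# Voices excluded from the run (presumably speakers that fail to load or
# synthesize in this setup; the list is kept verbatim from the original).
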
foreign_voices = [i for i in foreign_voices if i not in ['bn/multi_low#02194',
                                                         'uk_UK/m-ailabs_low#obruchov',
                                                         'uk_UK/m-ailabs_low#shepel',
                                                         'uk_UK/m-ailabs_low#loboda',
                                                         'uk_UK/m-ailabs_low#miskun',
                                                         'uk_UK/m-ailabs_low#sumska',
                                                         'uk_UK/m-ailabs_low#pysariev',
                                                         ]]

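# process_lines() mirrors Mimic3's internal CLI text loop but additionally
# collects all synthesized audio and writes it to wav_path.
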
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Adapted from Mimic3's internal CLI loop: synthesize every line in
    state.texts and write the concatenated audio to wav_path.'''

    args = state.args

    result_idx = 0
    print(f'Synthesizing {state.texts=}\n')
    for line in state.texts:

        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)
        result_idx += 1
        time.sleep(4)

    if state.all_audio:

        if sys.stdout.isatty() and (not state.args.stdout):
            with io.BytesIO() as wav_io:
                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
                with wav_file_play:
                    wav_file_play.setframerate(state.sample_rate_hz)
                    wav_file_play.setsampwidth(state.sample_width_bytes)
                    wav_file_play.setnchannels(state.num_channels)
                    wav_file_play.writeframes(state.all_audio)

                with open(wav_path, 'wb') as wav_file:
                    wav_file.write(wav_io.getvalue())
                    wav_file.seek(0)
            print('\n\nWrote', wav_path)
        else:
            print('\n\nNo audio written (stdout is not a tty or --stdout was set):', wav_path)

english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'

Path(english_dir).mkdir(parents=True, exist_ok=True)
Path(foreign_dir).mkdir(parents=True, exist_ok=True)

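# For every voice (first 4 per group): synthesize the Harvard sentences with
# Mimic3 (via its CLI internals) and with StyleTTS2, using a pre-generated
# Mimic3 recording of the same voice (assumed to exist under mimic3_<folder>_4x/)
# as the StyleTTS2 style prompt, then save one long wav per engine under
# <folder>_pkl/.
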
for folder, list_voices in [
    ['foreign', foreign_voices],
    ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n')
    for _id, _voice in enumerate(list_voices[:4]):
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _dir = folder + '_pkl/'
        if 'cmu-arctic' in _str:
            _str = _str.replace('cmu-arctic', 'cmu_arctic')

        print('\n\nExecuting', _voice, '\n\n')

        if (
            not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
            not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
        ):

            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)

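            # Ensure this voice's generator.onnx is present in the local
            # Mimic3 voices directory; copy it from ROOT_DIR if it is missing
            # or looks truncated (< 500 bytes).
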
            speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice

            if (
                (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
                (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)
            ):
                shutil.copyfile(
                    f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                    home_voice_dir + 'generator.onnx')

            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'

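            # harvard.json is expected to hold
            # {'sentences': [{'sentences': [ten sentences], ...}, ...]};
            # each inner list of 10 is joined into one paragraph, and only the
            # first group is processed here ([:1]).
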
            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']
            total_audio_mimic3 = []
            total_audio_stts2 = []
            ix = 0
            for list_of_10 in harvard_individual_sentences[:1]:
                text = ' '.join(list_of_10['sentences'])

                style_vec = msinference.compute_style(prompt_path)
                print(ix, text)
                ix += 1

                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)

                total_audio_stts2.append(x)

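                # Mimic3: wrap the same text in SSML for this voice, run the
                # CLI pipeline on it, and read back the synthesized wav.
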
                rate = 1
                _ssml = (
                    '<speak>'
                    '<prosody volume=\'64\'>'
                    f'<prosody rate=\'{rate}\'>'
                    f'<voice name=\'{_voice}\'>'
                    '<s>'
                    f'{text}'
                    '</s>'
                    '</voice>'
                    '</prosody>'
                    '</prosody>'
                    '</speak>'
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                args = get_args()
                args.ssml = True
                args.text = [_ssml]
                args.interactive = False

                state = CommandLineInterfaceState(args=args)
                initialize_args(state)
                initialize_tts(state)

                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')

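            # Concatenate all synthesized sentences per engine and save one wav
            # per voice (StyleTTS2 at 24 kHz, Mimic3 at 22.05 kHz).
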
            total_audio_stts2 = np.concatenate(total_audio_stts2)
            audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)

            total_audio_mimic3 = np.concatenate(total_audio_mimic3)
            audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)

            print('Saving:', _dir + 'mimic3__' + _str + '.wav')
        else:
            print('Skip:', _dir + 'styletts2__' + _str + '.wav')

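        # Run the teacher interface over both wavs and cache the per-window
        # features as .pkl, so the visualization pass below can reuse them.
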
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df = interface.process_file(harvard_of_voice + '.wav')
                df.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')

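# Visualization: for every voice, load (or compute) the cached feature frames
# of both engines, align them to a common length, and plot Mimic3 vs. StyleTTS2
# per dimension/emotion over time.
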
for folder, list_voices in [
    ['foreign', foreign_voices],
    ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
    for _id, _voice in enumerate(list_voices[:4]):
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _dir = folder + '_pkl/'
        if 'cmu-arctic' in _str:
            _str = _str.replace('cmu-arctic', 'cmu_arctic')

        vis_df = {}

        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df = interface.process_file(harvard_of_voice + '.wav')
                df.to_pickle(harvard_of_voice + '.pkl')
            else:
                df = pd.read_pickle(harvard_of_voice + '.pkl')
                print(harvard_of_voice + '.pkl', 'FOUND')

            vis_df[engine] = df

        # truncate both engines to the same number of windows and use the
        # window end time (in seconds) as the index
        SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
        for k, v in vis_df.items():
            p = v[:SHORT].copy()

            p.reset_index(inplace=True)
            p.drop(columns=['file', 'start'], inplace=True)
            p.set_index('end', inplace=True)

            p.index = p.index.map(lambda x: x.total_seconds())
            vis_df[k] = p

        print(vis_df)

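        # 10 rows x 2 columns of axes; only the first column is used here:
        # rows 0-2 show arousal/dominance/valence, rows 3-9 the emotion classes.
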
        fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
                               gridspec_kw={'hspace': 0, 'wspace': .04})

        time_stamp = vis_df['mimic3'].index.to_numpy()
        for j, dim in enumerate(['arousal',
                                 'dominance',
                                 'valence']):

            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
                          color=(0, 104/255, 139/255),
                          label='mean_1',
                          linewidth=2)
            ax[j, 0].fill_between(time_stamp,
                                  vis_df['mimic3'][dim],
                                  vis_df['styletts2'][dim],
                                  color=(.2, .2, .2),
                                  alpha=0.244)
            if j == 0:
                ax[j, 0].legend(['StyleTTS2 style mimic3',
                                 'StyleTTS2 style crema-d'],
                                prop={'size': 10},
                                )
            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)

            ax[j, 0].set_ylim([1e-7, .9999])

            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])

            ax[j, 0].grid()

        time_stamp = vis_df['styletts2'].index.to_numpy()
        for j, dim in enumerate(['Angry',
                                 'Sad',
                                 'Happy',
                                 'Surprise',
                                 'Fear',
                                 'Disgust',
                                 'Contempt',
                                 ]):
            j = j + 3

            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
                          color=(0, 104/255, 139/255),
                          label='mean_1',
                          linewidth=2)
            ax[j, 0].fill_between(time_stamp,
                                  vis_df['mimic3'][dim],
                                  vis_df['styletts2'][dim],
                                  color=(.2, .2, .2),
                                  alpha=0.244)

            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)

            ax[j, 0].set_ylim([1e-7, .9999])
            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
            ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4, .4, .4))

            ax[j, 0].grid()

        plt.savefig(f'bh_{_str}.png', bbox_inches='tight')
        plt.close()


print('UNCOMMENT msinference')