artificial-styletts2 / upload_styles.py

upload_styles.py

9184afc 2 months ago

8.74 kB

	# https://github.com/audeering/shift/tree/main - MAKE Mimic-3 voice / harvard 1x 4x
	import shutil
	import csv
	import io
	import os
	import typing
	import wave
	import sys
	from mimic3_tts.__main__ import (CommandLineInterfaceState,
	get_args,
	initialize_args,
	initialize_tts,
	# print_voices,
	# process_lines,
	shutdown_tts,
	OutputNaming,
	process_line)
	import time
	import json
	import os
	import numpy as np

	from pathlib import Path
	import audiofile


	# ================================================ LIST OF VOICES
	# Root of the local checkout holding the Mimic-3 voices, laid out as
	# <ROOT_DIR>/voices/<lang>/<voice>/[speakers.txt]
	ROOT_DIR = '/data/dkounadis/mimic3-voices/'
	foreign_voices = []
	english_voices = []

	# os.listdir() returns entries in arbitrary order. Sort them so that the
	# voice lists - and therefore the voice picked for each sentence further
	# down - are reproducible from run to run.
	for lang in sorted(os.listdir(ROOT_DIR + 'voices')):

	    # Language directories whose name contains 'en_' feed the English list,
	    # every other language feeds the foreign list.
	    target_voices = english_voices if 'en_' in lang else foreign_voices

	    for voice in sorted(os.listdir(ROOT_DIR + 'voices/' + lang)):
	        voice_key = lang + '/' + voice
	        try:
	            with open(ROOT_DIR + 'voices/' + voice_key + '/speakers.txt', 'r') as f:
	                # Multi-speaker voice: one '<lang>/<voice>#<speaker>' entry
	                # per line of speakers.txt
	                for spk in f:
	                    spk = spk.rstrip()
	                    if not spk.strip():
	                        # a blank line would yield a bogus '<voice>#' entry
	                        continue
	                    target_voices.append(voice_key + '#' + spk)
	        except FileNotFoundError:
	            # No speakers.txt: the voice is addressed without a '#speaker' suffix
	            target_voices.append(voice_key)

	# Show what was found: foreign voices first, then the English ones
	for voice_name in foreign_voices:
	    print(voice_name)
	print('\n_______________________________\n')
	for voice_name in english_voices:
	    print(voice_name)
	# ====================================================== LIST Mimic-3 ALL VOICES
	# list_voices = [
	# 'en_US/m-ailabs_low#mary_ann',
	# 'en_UK/apope_low',
	# 'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
	# # 'ko_KO/kss_low',
	# 'fr_FR/m-ailabs_low#gilles_g_le_blanc',

	# #'human',
	# ] # special - for human we load specific style file - no Mimic3 is run

	# ================================== ====== END INTERFACE





	def process_lines(state: CommandLineInterfaceState, wav_path=None):
	    '''Synthesize every text in ``state.texts`` and save the audio as a WAV file.

	    Local replacement for the ``process_lines`` of ``mimic3_tts.__main__``
	    (its import is commented out at the top of this file). Instead of playing
	    the combined audio or writing it to stdout, it is written to ``wav_path``.

	    Args:
	        state: Mimic-3 command line state. The function reads ``state.args``,
	            ``state.texts``, ``state.all_audio`` and the WAV parameters
	            ``state.sample_rate_hz``, ``state.sample_width_bytes`` and
	            ``state.num_channels``.
	        wav_path: Destination of the WAV file. It must be given whenever
	            audio is produced, because it is handed straight to ``open()``.

	    Returns:
	        None. If ``state.all_audio`` is empty, a hint is printed and no file
	        is written.
	    '''
	    args = state.args

	    print(f'why waitings in the for loop LIN {state.texts=}\n')
	    for line in state.texts:
	        line_voice: typing.Optional[str] = None
	        line_id = ""
	        line = line.strip()

	        if args.output_naming == OutputNaming.ID:
	            # Line has the format id|text instead of just text
	            with io.StringIO(line) as line_io:
	                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
	                row = next(reader)
	                line_id, line = row[0], row[-1]
	                if args.csv_voice:
	                    line_voice = row[1]

	        process_line(line, state,
	                     line_id=line_id,
	                     line_voice=line_voice)

	    # NOTE(review): presumably gives Mimic-3 time to fill state.all_audio
	    # before it is read below (see the hint printed when it is still empty)
	    # - confirm against mimic3_tts.
	    time.sleep(4)

	    if not state.all_audio:
	        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
	        return

	    # Always write the file when audio exists. The former
	    # `sys.stdout.isatty() and not args.stdout` gate came from the CLI, where
	    # it chose between playback and stdout; here it only meant that nothing
	    # was written when output was redirected, so callers read a stale file.
	    with open(wav_path, 'wb') as wav_handle, wave.open(wav_handle, "wb") as wav_file:
	        wav_file.setframerate(state.sample_rate_hz)
	        wav_file.setsampwidth(state.sample_width_bytes)
	        wav_file.setnchannels(state.num_channels)
	        wav_file.writeframes(state.all_audio)
	    print('\n\n5T', wav_path)

	# -----------------------------------------------------------------------------
	# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
	# ======================================================================




	# Main script: for each voice group (english / foreign) and each speaking
	# rate (1x / 4x) synthesize Harvard sentences with Mimic-3, cycling through
	# the voices of the group, and save the concatenation as one
	# 'harvards_upload_mimic3_<rate>_<lang>.wav' file.

	# The sentence lists do not change between iterations, so read them once.
	with open('harvard.json', 'r') as f:
	    harvard_individual_sentences = json.load(f)['sentences']

	# Flatten the first 4 sentence lists into one sequence of sentences
	harvard_texts = [
	    text
	    for list_of_10 in harvard_individual_sentences[:4]  # 77
	    for text in list_of_10['sentences']
	]

	# Scratch WAV that process_lines() overwrites for every sentence
	style_path = 'tmp1.wav'

	# NOTE: the commented-out code that used to live here copied
	# /data/dkounadis/mimic3-voices/voices/<voice>/generator.onnx into
	# ~/.local/share/mycroft/mimic3/voices/<voice>/ whenever the file there was
	# missing or smaller than 500 bytes (i.e. only a git-LFS header).

	for lang, list_voices in [
	    ['english', english_voices],
	    ['foreign', foreign_voices]
	]:
	    if not list_voices:
	        # `ix % len(list_voices)` below would divide by zero
	        print(f'No {lang} voices found - skipping')
	        continue

	    for rate in [1, 4]:
	        total_audio_mimic3 = []

	        for ix, text in enumerate(harvard_texts):

	            # Cycle through the available voices, one voice per sentence
	            _voice = list_voices[ix % len(list_voices)]

	            print(ix, lang, text)

	            # Build the SSML request. The last character of the sentence
	            # (its final punctuation) is replaced by ", .. !!!".
	            _ssml = (
	                '<speak>'
	                '<prosody volume=\'64\'>'
	                f'<prosody rate=\'{rate}\'>'
	                f'<voice name=\'{_voice}\'>'
	                '<s>'
	                f'{text[:-1] + ", .. !!!"}'
	                '</s>'
	                '</voice>'
	                '</prosody>'
	                '</prosody>'
	                '</speak>'
	            )
	            # Copy of the request on disk; nothing in this script reads it back
	            with open('_tmp_ssml.txt', 'w') as f:
	                f.write(_ssml)

	            # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
	            # ps.wait() # using ps to call mimic3 because samples dont have time to be written in stdout buffer
	            args = get_args()
	            args.ssml = True
	            args.text = [_ssml]
	            args.interactive = False

	            state = CommandLineInterfaceState(args=args)
	            initialize_args(state)
	            initialize_tts(state)

	            # Remove the previous sentence's file first, so a failed synthesis
	            # cannot silently reuse stale audio.
	            Path(style_path).unlink(missing_ok=True)
	            try:
	                process_lines(state, wav_path=style_path)
	            finally:
	                # release the TTS even if synthesis raised
	                shutdown_tts(state)

	            if not os.path.isfile(style_path):
	                print(f'No audio produced for sentence {ix} ({_voice}) - skipping')
	                continue

	            x, fs = audiofile.read(style_path)
	            total_audio_mimic3.append(x)

	        if not total_audio_mimic3:
	            # np.concatenate() raises on an empty list
	            print(f'No audio synthesized for rate={rate} lang={lang} - nothing to save')
	            continue

	        total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat all sentences
	        # NOTE(review): the file is written at a fixed 22050 Hz; the sampling
	        # rate `fs` returned by audiofile.read() is not used - confirm that
	        # every voice produces 22050 Hz audio.
	        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav', total_audio_mimic3, 22050)

	        print(total_audio_mimic3.shape, 'LEN\n')