# artificial-styletts2 / Utils /engineer_style_vectors_v2.py
# Dionyssos's picture
# add lfs
# f7fd0c3
# raw
# history blame contribute delete
# No virus
# 8.06 kB
from pathlib import Path
import shutil
import csv
import io
import os
import typing
import wave
import sys
from mimic3_tts.__main__ import (CommandLineInterfaceState,
get_args,
initialize_args,
initialize_tts,
# print_voices,
# process_lines,
shutdown_tts,
OutputNaming,
process_line)
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    """Run every text of ``state.texts`` through mimic3 and save one WAV.

    Local replacement for the ``process_lines`` of ``mimic3_tts.__main__``
    (its import is commented out at the top of this file). Each text is
    handed to ``process_line``; afterwards whatever has accumulated in
    ``state.all_audio`` is written to ``wav_path`` as a single WAV file.
    The audio is neither played nor sent to stdout, and writing does not
    depend on stdout being a terminal.

    Args:
        state: mimic3 CLI state. Reads ``state.args``, ``state.texts``,
            ``state.all_audio``, ``state.sample_rate_hz``,
            ``state.sample_width_bytes`` and ``state.num_channels``.
        wav_path: Destination of the WAV file. When ``None`` nothing is
            written; the audio remains available in ``state.all_audio``.
    """
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        print(f'LIN {line=}\n')
        line_voice: typing.Optional[str] = None
        line_id = ""
        # NOTE: blank lines are not skipped; they are still passed on to
        # process_line after stripping.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line is a CSV row: the first field is the id, the last field
            # is the text and, with csv_voice, the second field is the voice.
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    print('\nARRive at All Audio writing\n\n\n\n')

    # Nothing to save, or nowhere to save it.
    if not state.all_audio or wav_path is None:
        return

    # Write the combined audio straight into the destination file.
    with open(wav_path, 'wb') as wav_file:
        wav_writer: wave.Wave_write = wave.open(wav_file, 'wb')
        with wav_writer:
            wav_writer.setframerate(state.sample_rate_hz)
            wav_writer.setsampwidth(state.sample_width_bytes)
            wav_writer.setnchannels(state.num_channels)
            wav_writer.writeframes(state.all_audio)
# -----------------------------------------------------------------------------
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
# ======================================================================
# Output locations (all below `assets/`).
out_dir = 'assets/'
reference_wav_directory = 'assets/wavs/style_vector_v2/'
wav_dir = 'assets/wavs/'

# Make sure every output folder exists before anything is generated.
for _folder in (reference_wav_directory, out_dir, wav_dir):
    Path(_folder).mkdir(parents=True, exist_ok=True)

N_PIX = 11

# =======================================================================
# Start of png/wav generation
# =======================================================================
NOISE_SCALE = .667
# The default is .8 (__main__.py, L697). Currently ignored because of
# artefacts; the default is used for now.
NOISE_W = .9001
# Speaker ids of the multi-speaker voice `en_US/vctk_low`.
VCTK_SPEAKERS = [
    'p239', 'p236', 'p264', 'p250', 'p259', 'p247', 'p261', 'p263', 'p283', 'p274',
    'p286', 'p276', 'p270', 'p281', 'p277', 'p231', 'p238', 'p271', 'p257', 'p273',
    'p284', 'p329', 'p361', 'p287', 'p360', 'p374', 'p376', 'p310', 'p304', 'p340',
    'p347', 'p330', 'p308', 'p314', 'p317', 'p339', 'p311', 'p294', 'p305', 'p266',
    'p335', 'p334', 'p318', 'p323', 'p351', 'p333', 'p313', 'p316', 'p244', 'p307',
    'p363', 'p336', 'p312', 'p267', 'p297', 'p275', 'p295', 'p288', 'p258', 'p301',
    'p232', 'p292', 'p272', 'p278', 'p280', 'p341', 'p268', 'p298', 'p299', 'p279',
    'p285', 'p326', 'p300', 's5', 'p230', 'p254', 'p269', 'p293', 'p252', 'p345',
    'p262', 'p243', 'p227', 'p343', 'p255', 'p229', 'p240', 'p248', 'p253', 'p233',
    'p228', 'p251', 'p282', 'p246', 'p234', 'p226', 'p260', 'p245', 'p241', 'p303',
    'p265', 'p306', 'p237', 'p249', 'p256', 'p302', 'p364', 'p225', 'p362',
]
print(len(VCTK_SPEAKERS))

# Speaker ids of `en_US/cmu-arctic_low`.
# 'awb' and 'rms' are two separate entries: a missing comma between them
# used to fuse the two literals into the single name 'awbrms'.
CMU_ARCTIC_SPEAKERS = [
    'awb',
    'rms',
    'slt',
    'ksp',
    'clb',
    'aew',
    'bdl',
    'lnh',
    'jmk',
    'rxr',
    'fem',
    'ljm',
    'slp',
    'ahw',
    'axb',
    'aup',
    'eey',
    'gka',
]

# Speaker ids of `en_US/hifi-tts_low`.
HIFI_TTS_SPEAKERS = ['9017', '6097', '92']

# Speaker ids of `en_US/m-ailabs_low`.
M_AILABS_SPEAKERS = ['elliot_miller', 'judy_bieber', 'mary_ann']

# Every voice to synthesize, as `<voice>#<speaker>`; the two single-speaker
# voices at the end are listed without a `#<speaker>` suffix.
all_names = [
    f'{_voice_key}#{_speaker}'
    for _voice_key, _speakers in (
        ('en_US/vctk_low', VCTK_SPEAKERS),
        ('en_US/cmu-arctic_low', CMU_ARCTIC_SPEAKERS),
        ('en_US/hifi-tts_low', HIFI_TTS_SPEAKERS),
        ('en_US/m-ailabs_low', M_AILABS_SPEAKERS),
    )
    for _speaker in _speakers
]
all_names.append('en_US/ljspeech_low')  # LJspeech - single speaker
all_names.append('en_UK/apope_low')  # en_UK apope - only speaker
# Synthesize one reference wav per entry of `all_names`.
VOICES = {}  # NOTE(review): never populated or read in the code shown here
for _voice in all_names:
    # If the GitHub quota is exceeded, copy the mimic3 voices from local copies
    #
    # https://github.com/MycroftAI/mimic3-voices
    #
    # 'en_US/vctk_low#p239' -> 'en_US/vctk_low'; a name without '#' is
    # returned unchanged by split().
    speaker_free_voice_name = _voice.split('#')[0]
    home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
    Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
    if not os.path.isfile(home_voice_dir + 'generator.onnx'):
        shutil.copyfile(
            f'/data/dkounadis/cache/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
            home_voice_dir + 'generator.onnx')  # 'en_US incl. voice

    # File name of the reference wav, e.g.
    # 'en_US/cmu-arctic_low#awb' -> 'en_US_cmu_arctic_awb.wav'
    file_stem = (_voice.replace('/', '_')
                 .replace('#', '_')
                 .replace('_low', '')
                 .replace('cmu-arctic', 'cmu_arctic'))
    prepare_file = file_stem + '.wav'
    file_true = file_stem + '_true_.wav'
    file_false = file_stem + '_false_.wav'
    print(prepare_file, file_false, file_true)
    reference_wav = reference_wav_directory + prepare_file

    rate = 4  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
    _ssml = (
        '<speak>'
        '<prosody volume=\'64\'>'
        f'<prosody rate=\'{rate}\'>'
        f'<voice name=\'{_voice}\'>'
        '<s>'
        'Sweet dreams are made of this, .. !!! # I travel the world and the seven seas.'
        '</s>'
        '</voice>'
        '</prosody>'
        '</prosody>'
        '</speak>'
    )
    # The SSML is also saved to disk; the synthesis below does not read it
    # back (presumably kept for replaying via `cat _tmp_ssml.txt | mimic3 --ssml`).
    with open('_tmp_ssml.txt', 'w') as f:
        f.write(_ssml)

    # mimic3 is called in-process instead of through a `mimic3` subprocess.
    args = get_args()
    args.ssml = True
    args.text = [_ssml]
    args.interactive = False
    state = CommandLineInterfaceState(args=args)
    initialize_args(state)
    initialize_tts(state)
    try:
        process_lines(state, wav_path=reference_wav)
    finally:
        # Always shut the TTS down, even when synthesis of this voice fails.
        shutdown_tts(state)