|
|
|
from pathlib import Path |
|
import shutil |
|
import csv |
|
import io |
|
import os |
|
import typing |
|
import wave |
|
import sys |
|
from mimic3_tts.__main__ import (CommandLineInterfaceState, |
|
get_args, |
|
initialize_args, |
|
initialize_tts, |
|
|
|
|
|
shutdown_tts, |
|
OutputNaming, |
|
process_line) |
|
|
|
|
|
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesize every entry of ``state.texts`` and save the combined audio.

    Each entry is stripped and passed to ``process_line``. When
    ``state.args.output_naming`` equals ``OutputNaming.ID`` the entry is first
    parsed as a single CSV row (delimiter ``state.args.csv_delimiter``): the
    first column becomes the line id, the last column the text and, if
    ``state.args.csv_voice`` is set, the second column the voice.

    After all entries are processed, the bytes in ``state.all_audio``
    (presumably accumulated by ``process_line`` - confirm against mimic3_tts)
    are wrapped in a WAV container and written to ``wav_path``.

    Args:
        state: CLI state object; this function reads ``args``, ``texts``,
            ``all_audio``, ``sample_rate_hz``, ``sample_width_bytes`` and
            ``num_channels`` from it.
        wav_path: Destination of the WAV file. With the default ``None`` the
            lines are still processed but no file is written.

    Returns:
        None.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        print(f'LIN {line=}\n')
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # The entry is a CSV row: id, [voice], ..., text.
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    print('\nARRive at All Audio writing\n\n\n\n')

    if not state.all_audio:
        return

    # NOTE(review): the audio is only saved when stdout is a terminal and
    # ``state.args.stdout`` is false; with redirected output nothing is
    # written. Confirm that this is intended.
    if not sys.stdout.isatty() or state.args.stdout:
        return

    # Nothing to save to: opening ``None`` would raise a TypeError.
    if wav_path is None:
        return

    with io.BytesIO() as wav_io:
        # Closing the writer finalizes the WAV header inside the buffer.
        with wave.open(wav_io, "wb") as wav_writer:
            wav_writer.setframerate(state.sample_rate_hz)
            wav_writer.setsampwidth(state.sample_width_bytes)
            wav_writer.setnchannels(state.num_channels)
            wav_writer.writeframes(state.all_audio)
        wav_bytes = wav_io.getvalue()

    with open(wav_path, 'wb') as wav_file:
        wav_file.write(wav_bytes)
|
|
|
|
|
|
|
|
|
# Output locations, relative to the current working directory.
out_dir = 'assets/'
wav_dir = 'assets/wavs/'
reference_wav_directory = 'assets/wavs/style_vector_v2/'

# Create every output directory up front; existing directories are kept.
for _directory in (reference_wav_directory, out_dir, wav_dir):
    Path(_directory).mkdir(parents=True, exist_ok=True)

# NOTE(review): the three constants below are not referenced in the visible
# part of this script - confirm whether they are still needed.
N_PIX = 11
NOISE_SCALE = .667
NOISE_W = .9001
|
|
|
# Speaker ids of the multi-speaker voices, grouped by voice model.
vctk_speakers = [
    'p239', 'p236', 'p264', 'p250', 'p259', 'p247', 'p261', 'p263', 'p283', 'p274',
    'p286', 'p276', 'p270', 'p281', 'p277', 'p231', 'p238', 'p271', 'p257', 'p273',
    'p284', 'p329', 'p361', 'p287', 'p360', 'p374', 'p376', 'p310', 'p304', 'p340',
    'p347', 'p330', 'p308', 'p314', 'p317', 'p339', 'p311', 'p294', 'p305', 'p266',
    'p335', 'p334', 'p318', 'p323', 'p351', 'p333', 'p313', 'p316', 'p244', 'p307',
    'p363', 'p336', 'p312', 'p267', 'p297', 'p275', 'p295', 'p288', 'p258', 'p301',
    'p232', 'p292', 'p272', 'p278', 'p280', 'p341', 'p268', 'p298', 'p299', 'p279',
    'p285', 'p326', 'p300', 's5', 'p230', 'p254', 'p269', 'p293', 'p252', 'p345',
    'p262', 'p243', 'p227', 'p343', 'p255', 'p229', 'p240', 'p248', 'p253', 'p233',
    'p228', 'p251', 'p282', 'p246', 'p234', 'p226', 'p260', 'p245', 'p241', 'p303',
    'p265', 'p306', 'p237', 'p249', 'p256', 'p302', 'p364', 'p225', 'p362',
]

print(len(vctk_speakers))

# 'awb' and 'rms' are two separate speakers: without the comma between them
# Python concatenates the adjacent literals into the bogus id 'awbrms'.
cmu_arctic_speakers = [
    'awb', 'rms', 'slt', 'ksp', 'clb', 'aew', 'bdl', 'lnh', 'jmk',
    'rxr', 'fem', 'ljm', 'slp', 'ahw', 'axb', 'aup', 'eey', 'gka',
]

hifi_tts_speakers = ['9017', '6097', '92']

m_ailabs_speakers = ['elliot_miller', 'judy_bieber', 'mary_ann']

# Every voice to synthesize, as '<language>/<voice>#<speaker>'. The last two
# voices are listed without a '#<speaker>' part.
all_names = [
    *(f'en_US/vctk_low#{speaker}' for speaker in vctk_speakers),
    *(f'en_US/cmu-arctic_low#{speaker}' for speaker in cmu_arctic_speakers),
    *(f'en_US/hifi-tts_low#{speaker}' for speaker in hifi_tts_speakers),
    *(f'en_US/m-ailabs_low#{speaker}' for speaker in m_ailabs_speakers),
    'en_US/ljspeech_low',
    'en_UK/apope_low',
]
|
|
|
|
|
# NOTE(review): VOICES is never filled in the visible part of this script.
VOICES = {}

# Value placed in the SSML <prosody rate='...'> attribute for every voice.
rate = 4

# Synthesize one reference WAV per voice in ``all_names``.
for _voice in all_names:

    # Voice model name without the '#<speaker>' part; split() returns the
    # whole string when the voice has no '#'.
    speaker_free_voice_name = _voice.split("#")[0]

    # Make sure the voice directory holds generator.onnx, copying it from the
    # local cache when it is missing.
    home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
    Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
    if not os.path.isfile(home_voice_dir + 'generator.onnx'):
        shutil.copyfile(
            f'/data/dkounadis/cache/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
            home_voice_dir + 'generator.onnx')

    # File name derived from the voice, e.g.
    # 'en_US/cmu-arctic_low#awb' -> 'en_US_cmu_arctic_awb.wav'.
    _stem = (
        _voice.replace('/', '_')
        .replace('#', '_')
        .replace('_low', '')
        .replace('cmu-arctic', 'cmu_arctic')
    )
    prepare_file = _stem + '.wav'
    file_true = _stem + '_true_.wav'
    file_false = _stem + '_false_.wav'
    print(prepare_file, file_false, file_true)

    reference_wav = reference_wav_directory + prepare_file

    _ssml = (
        "<speak>"
        "<prosody volume='64'>"
        f"<prosody rate='{rate}'>"
        f"<voice name='{_voice}'>"
        "<s>"
        "Sweet dreams are made of this, .. !!! # I travel the world and the seven seas."
        "</s>"
        "</voice>"
        "</prosody>"
        "</prosody>"
        "</speak>"
    )
    # Keep a copy of the last SSML document on disk; the synthesis below
    # receives the text through ``args.text``, not through this file.
    with open('_tmp_ssml.txt', 'w') as f:
        f.write(_ssml)

    # Start from the parsed CLI arguments and force non-interactive SSML input.
    args = get_args()
    args.ssml = True
    args.text = [_ssml]
    args.interactive = False

    state = CommandLineInterfaceState(args=args)
    initialize_args(state)
    initialize_tts(state)
    try:
        process_lines(state, wav_path=reference_wav)
    finally:
        # Always shut the TTS state down, even when synthesis of this voice
        # fails, so a failure does not leave it initialized.
        shutdown_tts(state)
|
|