# https://github.com/audeering/shift/tree/main - MAKE Mimic-3 voice / harvard 1x 4x
"""Synthesize Harvard sentences with Mimic-3 voices at 1x and 4x speaking rate.

The script
  1. lists every voice (and speaker) found under ``ROOT_DIR/voices``,
     split into English (``en_*``) and foreign voices,
  2. synthesizes the first Harvard sentence lists from ``harvard.json``,
     cycling through the voices sentence by sentence,
  3. writes one concatenated file ``harvards_upload_mimic3_{rate}_{lang}.wav``
     per (language group, rate) pair.
"""
import csv
import io
import json
import os
import shutil
import sys
import time
import typing
import wave
from pathlib import Path
from xml.sax.saxutils import escape, quoteattr

import audiofile
import numpy as np
from mimic3_tts.__main__ import (
    CommandLineInterfaceState,
    OutputNaming,
    get_args,
    initialize_args,
    initialize_tts,
    process_line,
    shutdown_tts,
)

# ================================================ LIST OF VOICES
ROOT_DIR = '/data/dkounadis/mimic3-voices/'

# Sample rate written into the concatenated output files.
OUTPUT_SAMPLE_RATE = 22050


def _voice_keys(lang, voice):
    """Return the Mimic-3 voice keys of one voice directory.

    A multi-speaker voice has a ``speakers.txt`` file and yields one
    ``lang/voice#speaker`` key per line of that file; a voice without the
    file yields the single key ``lang/voice``.
    """
    base = lang + '/' + voice
    try:
        with open(ROOT_DIR + 'voices/' + base + '/speakers.txt', 'r') as f:
            return [base + '#' + spk.rstrip() for spk in f]
    except FileNotFoundError:
        return [base]


foreign_voices = []
english_voices = []
_voices_root = ROOT_DIR + 'voices'
for lang in os.listdir(_voices_root):
    _lang_dir = os.path.join(_voices_root, lang)
    if not os.path.isdir(_lang_dir):
        continue  # stray file next to the language directories
    # Keys look like 'en_US/m-ailabs_low#mary_ann' or 'en_UK/apope_low'.
    _target = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(_lang_dir):
        if os.path.isdir(os.path.join(_lang_dir, voice)):
            _target.extend(_voice_keys(lang, voice))

print('\n_______________________________\n')
for _key in english_voices:
    print(_key)


# ================================== ====== END INTERFACE


def process_lines(state: CommandLineInterfaceState, wav_path=None):
    """Synthesize every text in ``state.texts`` and store the audio as WAV.

    Local variant of the Mimic-3 CLI ``process_lines``: instead of playing
    the combined audio it writes it to ``wav_path``.

    Args:
        state: initialized Mimic-3 CLI state (``initialize_args`` and
            ``initialize_tts`` must have been called on it).
        wav_path: destination of the WAV file. Required as soon as audio
            has to be written.

    Raises:
        ValueError: if audio is ready to be written but ``wav_path`` is None.

    The file is written only when ``state.all_audio`` is non-empty, stdout
    is a TTY and ``state.args.stdout`` is falsy; callers must therefore
    check that ``wav_path`` exists afterwards.
    """
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # Wait before inspecting state.all_audio; without the pause the samples
    # are not there yet (see the message printed in the else branch).
    # NOTE(review): presumably process_line hands results to a background
    # consumer - confirm against mimic3_tts.
    time.sleep(4)

    if state.all_audio:
        if sys.stdout.isatty() and (not state.args.stdout):
            if wav_path is None:
                raise ValueError('wav_path is required to store the synthesized audio')
            # Wrap the raw frames in a WAV container in memory, then dump it.
            with io.BytesIO() as wav_io:
                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
                with wav_file_play:
                    wav_file_play.setframerate(state.sample_rate_hz)
                    wav_file_play.setsampwidth(state.sample_width_bytes)
                    wav_file_play.setnchannels(state.num_channels)
                    wav_file_play.writeframes(state.all_audio)

                with open(wav_path, 'wb') as wav_file:
                    wav_file.write(wav_io.getvalue())
                print('\n\n5T', wav_path)
    else:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)


def _build_ssml(text, voice, rate):
    """Return the SSML document that speaks ``text`` with ``voice`` at ``rate``.

    The last character of ``text`` (the sentence-final punctuation of the
    Harvard sentences) is replaced by ``", .. !!!"``. Text and attribute
    values are XML-escaped so the document always parses.
    """
    # NOTE(review): the reference markup had one more static tag pair between
    # <speak> and the rate <prosody>; its content is unknown, so it is left out.
    return (
        '<speak>'
        f'<prosody rate={quoteattr(str(rate))}>'
        f'<voice name={quoteattr(voice)}>'
        '<s>'
        f'{escape(text[:-1] + ", .. !!!")}'
        '</s>'
        '</voice>'
        '</prosody>'
        '</speak>'
    )


def _synthesize_ssml(ssml, wav_path):
    """Run one Mimic-3 synthesis of ``ssml``; return True if ``wav_path`` was written."""
    # A leftover file from the previous sentence must not pass for new audio.
    if os.path.exists(wav_path):
        os.remove(wav_path)

    args = get_args()
    args.ssml = True
    args.text = [ssml]
    args.interactive = False

    state = CommandLineInterfaceState(args=args)
    initialize_args(state)
    initialize_tts(state)
    try:
        process_lines(state, wav_path=wav_path)
    finally:
        # Always release the TTS, even when synthesis raises.
        shutdown_tts(state)
    return os.path.exists(wav_path)


# -----------------------------------------------------------------------------
# CLI equivalent of one synthesis:
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
#
# Prerequisite (manual): each voice needs a real generator.onnx under
# ~/.local/share/mycroft/mimic3/voices/<lang>/<voice>/ - a file below ~500
# bytes is only the git-LFS header; copy it from ROOT_DIR/voices/ then.
# ======================================================================

# The sentence lists do not depend on language group or rate: parse once.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

for lang, list_voices in [
    ['english', english_voices],
    ['foreign', foreign_voices],
]:
    if not list_voices:
        print(f'no {lang} voices found under {ROOT_DIR}; skipping')
        continue

    for rate in [1, 4]:
        total_audio_mimic3 = []
        ix = 0
        for list_of_10 in harvard_individual_sentences[:4]:  # 77
            for text in list_of_10['sentences']:
                # Cycle through the voices, one voice per sentence.
                _voice = list_voices[ix % len(list_voices)]
                ix += 1
                print(ix - 1, lang, text)

                _ssml = _build_ssml(text, _voice, rate)
                # Kept on disk so the sentence can be replayed through the CLI.
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                style_path = 'tmp1.wav'
                if not _synthesize_ssml(_ssml, style_path):
                    print(f'no audio written for {_voice!r}; skipping sentence')
                    continue

                x, fs = audiofile.read(style_path)
                if fs != OUTPUT_SAMPLE_RATE:
                    print(f'WARNING: {_voice!r} has sample rate {fs}, '
                          f'output is written at {OUTPUT_SAMPLE_RATE}')
                total_audio_mimic3.append(x)

        if not total_audio_mimic3:
            print(f'nothing synthesized for {lang} at rate {rate}; no file written')
            continue

        total_audio_mimic3 = np.concatenate(total_audio_mimic3)
        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav',
                        total_audio_mimic3,
                        OUTPUT_SAMPLE_RATE)
        print(total_audio_mimic3.shape, 'LEN\n')