File size: 7,443 Bytes

9b9c715

# Create foreign-language style vectors - to use for English StyleTTS2.

# The script may crash with an onnx "protobuf incomplete file" error, attributed to shutil.copyfile() not truly blocking.
# If that happens, rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
from pathlib import Path
import shutil
import csv
import io
import os
import typing
import wave
import sys
from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                 get_args,
                                 initialize_args,
                                 initialize_tts,
                                 # print_voices,
                                 # process_lines,
                                 shutdown_tts,
                                 OutputNaming,
                                 process_line)
import time
ROOT_DIR = '/data/dkounadis/mimic3-voices/'

# Collect every non-English mimic3 voice found under ROOT_DIR/voices.
# Each entry is 'lang/voice#speaker' (one per line of the voice's speakers.txt),
# or plain 'lang/voice' when the voice directory has no speakers.txt.
foreign_voices = []
voices_root = ROOT_DIR + 'voices'
for lang in os.listdir(voices_root):
    if 'en_' in lang:
        continue  # English voices are not wanted here
    for voice in os.listdir(voices_root + '/' + lang):
        voice_key = lang + '/' + voice
        speakers_path = voices_root + '/' + voice_key + '/speakers.txt'
        try:
            with open(speakers_path, 'r') as speakers_file:
                speaker_names = [entry.rstrip() for entry in speakers_file]
        except FileNotFoundError:
            # Single-speaker voice, e.g. siwis_low has no speakers.txt
            foreign_voices.append(voice_key)
        else:
            foreign_voices.extend([voice_key + '#' + name for name in speaker_names])
# --- Now we have all speakers per voice -- so we can call mimic3 on those, perhaps with copyfile
# print(foreign_voices, len(foreign_voices))






# ----------------------
# print(foreign_voices.keys(), len(foreign_voices))
# raise SystemExit


def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesize every line of ``state.texts`` and save the audio to ``wav_path``.

    Local variant of the mimic3 ``process_lines`` routine: instead of playing
    the combined audio or writing it to stdout, it stores it as a WAV file.

    Args:
        state: mimic3 command-line state. This function reads ``state.args``,
            ``state.texts``, ``state.all_audio``, ``state.sample_rate_hz``,
            ``state.sample_width_bytes`` and ``state.num_channels``.
        wav_path: Destination path of the WAV file. Must be provided whenever
            audio is produced (``open(None, ...)`` raises ``TypeError``).

    Returns:
        None. The outcome is reported with a ``print``; when
        ``state.all_audio`` is empty no file is written.
    '''

    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        # Blank lines are deliberately NOT skipped in this variant.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text (optionally id|voice|text) instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # Grace period kept from the original script: per its own log message the
    # audio may not have reached state.all_audio yet right after the loop.
    # NOTE(review): a fixed sleep is fragile - presumably mimic3 offers a proper
    # way to wait for pending results; confirm and replace.
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    # Build the complete WAV in memory first so that a failure while encoding
    # never leaves a truncated file at wav_path.
    with io.BytesIO() as wav_io:
        with wave.open(wav_io, "wb") as wav_writer:
            wav_writer.setframerate(state.sample_rate_hz)
            wav_writer.setsampwidth(state.sample_width_bytes)
            wav_writer.setnchannels(state.num_channels)
            wav_writer.writeframes(state.all_audio)
        wav_bytes = wav_io.getvalue()

    # The file is written regardless of whether stdout is a terminal: the
    # destination is a path, so the tty state of stdout is irrelevant. The
    # previous tty check silently dropped the audio under pipes/redirects.
    with open(wav_path, 'wb') as wav_file:
        wav_file.write(wav_bytes)
    print('\n\nTTSING', wav_path)

# -----------------------------------------------------------------------------
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
# ======================================================================

# Output directory: one reference WAV per foreign voice/speaker.
reference_wav_directory = 'style_vectors_speed1_ICASSP/'
Path(reference_wav_directory).mkdir(parents=True, exist_ok=True)
wav_dir = 'assets/wavs/'
Path(wav_dir).mkdir(parents=True, exist_ok=True)

for _voice in foreign_voices:
    # '<lang>/<voice>' without the optional '#<speaker>' suffix.
    # split('#')[0] returns the whole string when there is no '#'.
    speaker_free_voice_name = _voice.split('#')[0]

    # Mimic3 GitHub quota exceeded:
    #    https://github.com/MycroftAI/mimic3-voices
    #    The repo above can exceed the LFS download quota, so the model is
    #    copied from a local clone of
    #    https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
    #    into the per-user mimic3 voice directory.
    # NOTE(review): the home directory is hard-coded for one user.
    home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
    Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
    local_generator = home_voice_dir + 'generator.onnx'
    if (
        (not os.path.isfile(local_generator)) or
        (os.path.getsize(local_generator) < 500)  # .onnx - is just LFS header
    ):
        # Use ROOT_DIR rather than a second hard-coded copy of the same path.
        shutil.copyfile(
            ROOT_DIR + 'voices/' + speaker_free_voice_name + '/generator.onnx',
            local_generator)

    # File name of the reference WAV, e.g. [...cmu-arctic...](....cmu_arctic....wav)
    prepare_file = (
        _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        .replace('cmu-arctic', 'cmu_arctic')
        + '.wav'
    )

    reference_wav = reference_wav_directory + prepare_file
    if os.path.isfile(reference_wav):
        continue  # already synthesized on a previous run

    rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
    _ssml = (
        '<speak>'
        "<prosody volume='64'>"
        f"<prosody rate='{rate}'>"
        f"<voice name='{_voice}'>"
        '<s>'
        'Sweet dreams are made of this, .. !!! # I travel the world and the seven seas.'
        '</s>'
        '</voice>'
        '</prosody>'
        '</prosody>'
        '</speak>'
    )
    # Kept on disk so the same SSML can be replayed manually, e.g.
    #   cat _tmp_ssml.txt | mimic3 --ssml > out.wav
    with open('_tmp_ssml.txt', 'w') as f:
        f.write(_ssml)

    args = get_args()
    args.ssml = True
    args.text = [_ssml]
    args.interactive = False

    state = CommandLineInterfaceState(args=args)
    initialize_args(state)
    initialize_tts(state)
    try:
        process_lines(state, wav_path=reference_wav)
    finally:
        # Always release the TTS engine, even if synthesis raised.
        shutdown_tts(state)

    # process_lines writes nothing when no audio was produced; do not abort
    # the remaining voices with a FileNotFoundError in that case.
    if os.path.isfile(reference_wav):
        print(os.path.getsize(reference_wav), 'SZ')
    else:
        print(f'No reference wav was written for voice {_voice!r}: {reference_wav}')