dkounadis
/

artificial-styletts2

@@ -1,10 +1,3 @@
-# 1. Syntesize Harvard Sentences via Mimic-3 - 1 voice
-# 1. Synthesize                  via StyleTTS2 --> use same or sweetdreams
-# 2. Run audinterface on this 767
-# 3.      .mimic3_pkl   .styletts2_pkl     -> different durations
-# It may crash due to non-truly-blocking shutil.copyfile() saying onnx protobuf incomplete file
-# You have to rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
 import shutil
 import csv
 import io
@@ -12,6 +5,7 @@ import os
 import typing
 import wave
 import sys
 from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                  get_args,
                                  initialize_args,
@@ -21,7 +15,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                  shutdown_tts,
                                  OutputNaming,
                                  process_line)
-# import msinference
 import time
 import json
 import pandas as pd
@@ -39,31 +33,44 @@ import audiofile
 # ================================================ LIST OF VOICES
-ROOT_DIR = '/data/dkounadis/mimic3-voices/'
-foreign_voices = []
-english_voices = []
-for lang in os.listdir(ROOT_DIR + 'voices'):
-        for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
-            if 'en_' in lang:
-                try:
-                    with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
-                        for spk in f:
-                            english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
-                        # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
-                except FileNotFoundError:
-                    english_voices.append(lang + '/' + voice)
-            else:
-                try:
-                    with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
-                        for spk in f:
-                            foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
-                except FileNotFoundError:
-                    foreign_voices.append(lang + '/' + voice)
 # ================================================== INTERFACE MODELS
 LABELS = [
     'arousal', 'dominance', 'valence',
@@ -156,8 +163,8 @@ interface = audinterface.Feature(
     process_func=process_function,
     # process_func_args={'outputs': 'logits_scene'},
     process_func_applies_sliding_window=False,
-    win_dur=4.0,
-    hop_dur=1.0,
     sampling_rate=16000,
     resample=True,
     verbose=True,
@@ -168,38 +175,6 @@ interface = audinterface.Feature(
-# Filter insufficient durations - prompt
-foreign_voices = [i for i in foreign_voices if i not in ['bn/multi_low#02194',
-                                                         'uk_UK/m-ailabs_low#obruchov',
-                                                         'uk_UK/m-ailabs_low#shepel',
-                                                         'uk_UK/m-ailabs_low#loboda',
-                                                         'uk_UK/m-ailabs_low#miskun',
-                                                         'uk_UK/m-ailabs_low#sumska',
-                                                         'uk_UK/m-ailabs_low#pysariev',
-                                                         ]]
-# print(english_voices, '\n_________________________\n', foreign_voices)
-# ----------------------
-# print(foreign_voices.keys(), len(foreign_voices))
-# raise SystemExit
 def process_lines(state: CommandLineInterfaceState, wav_path=None):
     '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
@@ -264,114 +239,36 @@ def process_lines(state: CommandLineInterfaceState, wav_path=None):
 # https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
 # STYLES Already Made - HF
-english_dir = 'english_pkl/'
-foreign_dir = 'foreign_pkl/'
-Path(english_dir).mkdir(parents=True, exist_ok=True)
-Path(foreign_dir).mkdir(parents=True, exist_ok=True)
-#             # state.output_dir = '.noopy'
-#             # state.interactive = False
-#             # state.output_naming = OutputNaming.TIME
-#             # # state.ssml = 1234546575
-#             # state.stdout = True
-#             # state.tts = True
-#             process_lines(state, wav_path='tmp1.wav')
-#             shutdown_tts(state)
-#             x, fs = audiofile.read('tmp1.wav')
-#             total_audio_mimic3.append(x)
-#             print(fs, text, 'mimic3')
-#             # MIMIC3 = = = = = = = = = = = = = = END
-#         total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
-#         audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
-#         total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
-#         audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
-#         print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
-#     else:
-#         print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')
-# load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
-# FOREIGN
-for folder, list_voices in [
-        ['foreign', foreign_voices],
-        ['english', english_voices],
-            ]:
-    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
-    for _id, _voice in enumerate(list_voices[:4]):
-        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-        _dir = folder + '_pkl/'
-        if 'cmu-arctic' in _str:
-            _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
-        print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
-        if (
-            not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
-            not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
-        ):
-            # Mimic3 GitHub Quota exceded:
-            #    https://github.com/MycroftAI/mimic3-voices
-            #    Above repo can exceed download quota of LFS
-            # Copy mimic-voices from local copies
-            #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
-            #    copy to ~/
-            #
-            #
             home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
             Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
             speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
@@ -386,53 +283,28 @@ for folder, list_voices in [
                     f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                     home_voice_dir + 'generator.onnx')
-            # pre made
-            prompt_path  = f'mimic3_{folder}_4x/' + _str + '.wav'
-            # ACTUAL TTS
-            with open('harvard.json', 'r') as f:
-                harvard_individual_sentences = json.load(f)['sentences']
-            total_audio_mimic3 = []
-            total_audio_stts2 = []
-            ix = 0
-            for list_of_10 in harvard_individual_sentences[:1]:  # 77
-                text = ' '.join(list_of_10['sentences'])
-                # harvard.append(long_sentence.replace('.', ' '))
-                # for text in list_of_10['sentences']:
-                style_vec = msinference.compute_style(prompt_path)
-                print(ix, text)
-                ix += 1
-                x = msinference.inference(text,
-                                            style_vec,
-                                            alpha=0.3,
-                                            beta=0.7,
-                                            diffusion_steps=7,
-                                            embedding_scale=1)
-                total_audio_stts2.append(x)
-                # also synthesize mimic with the same sentence and voice
-                # MIMIC-3 = = = = = = = = = = = = = = BEGIN
-                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                 _ssml = (
                     '<speak>'
                     '<prosody volume=\'64\'>'
@@ -472,51 +344,75 @@ for folder, list_voices in [
                 process_lines(state, wav_path='tmp1.wav')
                 shutdown_tts(state)
                 x, fs = audiofile.read('tmp1.wav')
-                total_audio_mimic3.append(x)
-                print(fs, text, 'mimic3')
-                # MIMIC3 = = = = = = = = = = = = = = END
-            total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
-            audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
-            total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
-            audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
-            print('Saving:', _dir + 'mimic3__' + _str + '.wav')
-        else:
-            print('Skip:', _dir + 'styletts2__' + _str + '.wav')
-        # AUD   I N T E R F A C E
-            # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
-        for engine in ['mimic3', 'styletts2']:
-            harvard_of_voice = f'{_dir}{engine}__{_str}'
-            if not os.path.exists(harvard_of_voice + '.pkl'):
-                df = interface.process_file(harvard_of_voice + '.wav')
-                df.to_pickle(harvard_of_voice + '.pkl')
-            else:
-                # df = pd.read_pickle(harvard_of_voice + '.pkl')
-                print(harvard_of_voice + '.pkl', 'FOUND')
-                # Her we have pkls
 # ===============================================================================
 # V I S U A L S

 import shutil
 import csv
 import io
 import typing
 import wave
 import sys
+import audresample
 from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                  get_args,
                                  initialize_args,
                                  shutdown_tts,
                                  OutputNaming,
                                  process_line)
+import msinference
 import time
 import json
 import pandas as pd
 # ================================================ LIST OF VOICES
+# ROOT_DIR = '/data/dkounadis/mimic3-voices/'
+# foreign_voices = []
+# english_voices = []
+# for lang in os.listdir(ROOT_DIR + 'voices'):
+#         for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
+#             if 'en_' in lang:
+#                 try:
+#                     with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
+#                         for spk in f:
+#                             english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
+#                         # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
+#                 except FileNotFoundError:
+#                     english_voices.append(lang + '/' + voice)
+#             else:
+#                 try:
+#                     with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
+#                         for spk in f:
+#                             foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
+#                 except FileNotFoundError:
+#                     foreign_voices.append(lang + '/' + voice)
+# #
+# [print(i) for i in foreign_voices]
+# print('\n_______________________________\n')
+# [print(i) for i in english_voices]
+# ====================================================== END PRINT LIST OF VOICES
+list_voices = [
+    'en_US/m-ailabs_low#mary_ann',
+    'en_UK/apope_low',
+    'de_DE/thorsten-emotion_low#neutral',  # is the 4x really interesting we can just write it in Section
+    'human'
+    ]  # special - for human we load specific style file - no Mimic3 is run
 # ================================================== INTERFACE MODELS
 LABELS = [
     'arousal', 'dominance', 'valence',
     process_func=process_function,
     # process_func_args={'outputs': 'logits_scene'},
     process_func_applies_sliding_window=False,
+    win_dur=7.0,
+    hop_dur=4.0,
     sampling_rate=16000,
     resample=True,
     verbose=True,
 def process_lines(state: CommandLineInterfaceState, wav_path=None):
     '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
 # https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
 # STYLES Already Made - HF
+out_dir = 'out_dir/'
+Path(out_dir).mkdir(parents=True, exist_ok=True)
+for _id, _voice in enumerate(list_voices):
+    _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+    if 'cmu-arctic' in _str:
+        _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
+    print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
+    if (
+        not os.path.isfile(out_dir + 'mimic3__' + _str + '.wav') or
+        not os.path.isfile(out_dir + 'styletts2__' + _str + '.wav')
+    ):
+        # Mimic3 GitHub quota exceeded:
+        #    https://github.com/MycroftAI/mimic3-voices
+        #    The repo above can exceed its Git LFS download quota.
+        # Workaround: copy the Mimic-3 voices from a local clone instead:
+        #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
+        #    copy to ~/
+        #
+        #
+        if 'human' not in _voice:
+            # ensure the Mimic-3 generator.onnx is present in the local voice directory
             home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
             Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
             speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
                     f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                     home_voice_dir + 'generator.onnx')
+        # prompt_path  = f'mimic3_{folder}_4x/' + _str + '.wav'
+        with open('harvard.json', 'r') as f:
+            harvard_individual_sentences = json.load(f)['sentences']
+        total_audio_mimic3 = []
+        total_audio_stts2 = []
+        ix = 0
+        for list_of_10 in harvard_individual_sentences[:1]:  # 77
+            text = ' '.join(list_of_10['sentences'])
+            print(ix, text)
+            ix += 1
+            # Synthesize with Mimic-3, then use that audio as the prompt for StyleTTS2
+            # Mimic-3 runs only if _voice is not 'human'
+            if 'human' not in _voice:
+                rate = 1
                 _ssml = (
                     '<speak>'
                     '<prosody volume=\'64\'>'
                 process_lines(state, wav_path='tmp1.wav')
                 shutdown_tts(state)
                 x, fs = audiofile.read('tmp1.wav')
+                print(x.shape)
+            else:
+                # Human reference: no Mimic-3 synthesis is run, a fixed recording is loaded instead.
+                # MSP['valence.train.votes'].get().sort_values('7').index[-1]
+                human_style = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
+                x, fs = audiofile.read(human_style)
+                print(x.shape,' human')   # TODO: crop human to almost mimic-3 duration (not done here)
+            total_audio_mimic3.append(x)
+            print(fs, text, 'mimic3')
+            # MIMIC3 = = = = = = = = = = = = = = END
+            # Style prompt for StyleTTS2: the Mimic-3 output just written to tmp1.wav,
+            # or the human recording. On the 'human' path nothing refreshes tmp1.wav,
+            # so reading it there would use a stale (or missing) file from a previous voice.
+            prompt_wav = human_style if 'human' in _voice else 'tmp1.wav'
+            style_vec = msinference.compute_style(prompt_wav)
+            x = msinference.inference(text,
+                                        style_vec,
+                                        alpha=0.3,
+                                        beta=0.7,
+                                        diffusion_steps=7,
+                                        embedding_scale=1)
+            total_audio_stts2.append(x)
+        total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
+        total_audio_stts2 = audresample.resample(total_audio_stts2, original_rate=24000, target_rate=16000)[0]  # for audinterface
+        audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 16000)
+        total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
+        # Resample from the rate audiofile.read() reported for this reference audio (fs),
+        # not a hard-coded 24000 Hz: that is the StyleTTS2 rate used above, and assuming it
+        # here shifts pitch/duration whenever the Mimic-3 or human audio has another rate.
+        total_audio_mimic3 = audresample.resample(total_audio_mimic3, original_rate=fs, target_rate=16000)[0]
+        audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)
+        print('Saving:', out_dir + 'mimic3__' + _str + '.wav')
+    else:
+        print('Skip:', out_dir + 'styletts2__' + _str + '.wav')
+    # AUDINTERFACE: extract features for both engines and cache them as .pkl
+        # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
+    for engine in ['mimic3', 'styletts2']:
+        harvard_of_voice = f'{out_dir}{engine}__{_str}'
+        if not os.path.exists(harvard_of_voice + '.pkl'):
+            df = interface.process_file(harvard_of_voice + '.wav')
+            df.to_pickle(harvard_of_voice + '.pkl')
+        else:
+            # df = pd.read_pickle(harvard_of_voice + '.pkl')
+            print(harvard_of_voice + '.pkl', 'FOUND')
+print('\nVisuals\n')
 # ===============================================================================
 # V I S U A L S