dkounadis
/

artificial-styletts2

# 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
+# 1. Synthesize                  via StyleTTS2 --> use same or sweetdreams
+# 2. Run audinterface on this 767
+# 3.      .mimic3_pkl   .styletts2_pkl     -> different durations
# It may crash because shutil.copyfile() is not truly blocking: onnx then reports an incomplete protobuf file.
# If that happens, rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
+import shutil
+import csv
+import io
+import os
+import typing
+import wave
+import sys
+from mimic3_tts.__main__ import (CommandLineInterfaceState,
+                                 get_args,
+                                 initialize_args,
+                                 initialize_tts,
+                                 # print_voices,
+                                 # process_lines,
+                                 shutdown_tts,
+                                 OutputNaming,
+                                 process_line)
+import msinference
+import time
+import json
+import pandas as pd
+import os
+import numpy as np
+import audonnx
+import audb
+from pathlib import Path
+import transformers
+import torch
+import audmodel
+import audinterface
+import matplotlib.pyplot as plt
+import audiofile
+# ================================================ LIST OF VOICES
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
# Voice identifiers are collected as '<lang>/<voice>' or, when the voice
# folder has a speakers.txt, as '<lang>/<voice>#<speaker>' (one per line).
foreign_voices = []
english_voices = []
for lang in os.listdir(ROOT_DIR + 'voices'):
    # language folders whose name contains 'en_' go to the English list
    bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        voice_id = lang + '/' + voice
        speakers_txt = ROOT_DIR + 'voices/' + voice_id + '/speakers.txt'
        try:
            with open(speakers_txt, 'r') as f:
                for spk in f:
                    bucket.append(voice_id + '#' + spk.rstrip())
        except FileNotFoundError:
            # no speakers.txt -> list the voice without a speaker suffix
            bucket.append(voice_id)
+# ================================================== INTERFACE MODELS
# Names of the 11 output columns: three dimensional scores followed by
# eight categorical emotion scores.
_DIMENSIONAL_LABELS = ['arousal', 'dominance', 'valence']
# 'speech_synthesizer', 'synthetic_singing',  (currently disabled)
_CATEGORICAL_LABELS = [
    'Angry',
    'Sad',
    'Happy',
    'Surprise',
    'Fear',
    'Disgust',
    'Contempt',
    'Neutral',
]
LABELS = _DIMENSIONAL_LABELS + _CATEGORICAL_LABELS
# In the visible code this config object is only used as a holder for the
# two torch devices read by the model-loading and inference code.
args = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg')
for _device_attr in ('dev', 'dev2'):
    setattr(args, _device_attr, torch.device('cuda:0'))
+def _softmax(x):
+    '''x : (batch, num_class)'''
+    x -= x.max(1, keepdims=True)  # if all -400 then sum(exp(x)) = 0
+    x = np.maximum(-100, x)
+    x = np.exp(x)
+    x /= x.sum(1, keepdims=True)
+    return x
+from transformers import AutoModelForAudioClassification
+import types
+def _infer(self, x):
+    '''x: (batch, audio-samples-16KHz)'''
+    x = (x + self.config.mean) / self.config.std  # plus
+    x = self.ssl_model(x, attention_mask=None).last_hidden_state
+    # pool
+    h = self.pool_model.sap_linear(x).tanh()
+    w = torch.matmul(h, self.pool_model.attention)
+    w = w.softmax(1)
+    mu = (x * w).sum(1)
+    x = torch.cat(
+        [
+            mu,
+            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
+        ], 1)
+    return self.ser_model(x)
# Categorical emotion teacher, loaded from the Hugging Face hub.
_TEACHER_CAT_REPO = '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes'
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    _TEACHER_CAT_REPO,
    trust_remote_code=True,  # fun definitions see 3loi/SER-.. repo
)
teacher_cat = teacher_cat.to(args.dev2)
teacher_cat = teacher_cat.eval()
# calling teacher_cat(...) now runs _infer
teacher_cat.forward = types.MethodType(_infer, teacher_cat)
# Audioset & ADV
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
_ADV_MODEL_ID = '90398682-2.0.0'
adv_model = audonnx.load(audmodel.load(_ADV_MODEL_ID), device='cuda:0')
def process_function(x, sampling_rate, idx):
    '''Score one audio window with both teacher models.

    Args:
        x: numpy signal handed over by audinterface
            (assumes (batch, samples) float audio at 16 kHz - TODO confirm).
        sampling_rate: unused; both models are called for 16 kHz audio.
        idx: unused.

    Returns:
        Array with `logits_adv` followed by the softmax of the 8
        categorical logits, joined along axis 1.
        Category order of the teacher:
        {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
         4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
    '''
    # Resolve the device from the model rather than from the module-level
    # `args`, so inference keeps working even if that name gets rebound.
    device = next(teacher_cat.parameters()).device
    # inference only: do not build an autograd graph
    with torch.no_grad():
        logits_cat = teacher_cat(
            torch.from_numpy(x).to(device)).cpu().detach().numpy()
    # USE ALL CATEGORIES
    # --
    # logits_audioset = audioset_model(x, 16000)['logits_sounds']
    # logits_audioset = logits_audioset[:, [7, 35]]  # speech synthesizer synthetic singing
    # --
    logits_adv = adv_model(x, 16000)['logits']
    cat = np.concatenate(
        [
            logits_adv,
            # _sigmoid(logits_audioset),
            _softmax(logits_cat),
        ],
        1)
    print(cat)
    return cat
# Feature interface: every 4 s window (1 s hop) of 16 kHz audio is passed
# to process_function, one output column per entry of LABELS.
_interface_options = dict(
    feature_names=LABELS,
    process_func=process_function,
    # process_func_args={'outputs': 'logits_scene'},
    process_func_applies_sliding_window=False,
    win_dur=4.0,
    hop_dur=1.0,
    sampling_rate=16000,
    resample=True,
    verbose=True,
)
interface = audinterface.Feature(**_interface_options)
+# ======================================== END INTERFACE
+# Filter insufficient durations - prompt
# voices excluded from the foreign list
_EXCLUDED_FOREIGN_VOICES = frozenset((
    'bn/multi_low#02194',
    'uk_UK/m-ailabs_low#obruchov',
    'uk_UK/m-ailabs_low#shepel',
    'uk_UK/m-ailabs_low#loboda',
    'uk_UK/m-ailabs_low#miskun',
    'uk_UK/m-ailabs_low#sumska',
    'uk_UK/m-ailabs_low#pysariev',
))
foreign_voices = [
    voice_id for voice_id in foreign_voices
    if voice_id not in _EXCLUDED_FOREIGN_VOICES
]
+# print(english_voices, '\n_________________________\n', foreign_voices)
+# ----------------------
+# print(foreign_voices.keys(), len(foreign_voices))
+# raise SystemExit
def process_lines(state: CommandLineInterfaceState, wav_path=None,
                  settle_seconds=4):
    '''Run every entry of `state.texts` through mimic3 and save the audio.

    Adapted from mimic3's own `process_lines`.

    Args:
        state: initialised mimic3 CLI state; `state.texts`, `state.args`
            and, after synthesis, `state.all_audio` plus the sample
            rate / width / channel fields are read.
        wav_path: destination of the combined WAV file. Nothing is
            written when it is None.
        settle_seconds: pause after the last line before the audio is
            collected (presumably the audio is appended to
            `state.all_audio` asynchronously - TODO confirm).
    '''
    args = state.args
    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        # print(f'LIN {line=}\n')  # prints \n so is empty not getting the predifne text of state.texts
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()
        # if not line:
        #     continue
        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]
        process_line(line, state, line_id=line_id, line_voice=line_voice)
    time.sleep(settle_seconds)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return
    if wav_path is None:
        # previously this ended in open(None) -> TypeError
        print('\n\nNo wav_path given, audio is not saved')
        return
    # The file is written regardless of whether stdout is a terminal:
    # the old isatty() gate skipped the write without any message when
    # the output was redirected, leaving a stale file for the caller.
    with wave.open(wav_path, 'wb') as wav_file:
        wav_file.setframerate(state.sample_rate_hz)
        wav_file.setsampwidth(state.sample_width_bytes)
        wav_file.setnchannels(state.num_channels)
        wav_file.writeframes(state.all_audio)
    print('\n\n5T', wav_path)
+# -----------------------------------------------------------------------------
+# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
+# ======================================================================
+#                      END DEF
+# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
+# STYLES Already Made - HF
# output folders, created up front if missing
english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'
for _out_dir in (english_dir, foreign_dir):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)
+# # synth 767
+# for _id, _voice in enumerate(foreign_voices):
+#     _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+#     if 'cmu-arctic' in _str:
+#         _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
+#     print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
+#     if (
+#         not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
+#         not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
+#     ):
+#         # Mimic3 GitHub Quota exceded:
+#         #    https://github.com/MycroftAI/mimic3-voices
+#         #    Above repo can exceed download quota of LFS
+#         # Copy mimic-voices from local copies
+#         #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
+#         #    copy to ~/
+#         #
+#         #
+#         home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
+#         Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
+#         speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
+#         if (
+#             (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
+#             (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
+#                 ):
+#             # Copy
+#             shutil.copyfile(
+#                 f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
+#                 home_voice_dir + 'generator.onnx')
+#         # pre made
+#         prompt_path  = 'mimic3_foreign_4x/' + _str + '.wav'
+#         # =========================================================================== HARVRAD wav
+#         with open('harvard.json', 'r') as f:
+#             harvard_individual_sentences = json.load(f)['sentences']
+#         total_audio_mimic3 = []
+#         total_audio_stts2 = []
+#         ix = 0
+#         for list_of_10 in harvard_individual_sentences[:1]:  # 77
+#             text = ' '.join(list_of_10['sentences'])
+#             # harvard.append(long_sentence.replace('.', ' '))
+#             # for text in list_of_10['sentences']:
+#             style_vec = msinference.compute_style(prompt_path)
+#             print(ix, text)
+#             ix += 1
+#             x = msinference.inference(text,
+#                                         style_vec,
+#                                         alpha=0.3,
+#                                         beta=0.7,
+#                                         diffusion_steps=7,
+#                                         embedding_scale=1)
+#             total_audio_stts2.append(x)
+#             # also synthesize mimic with the same sentence and voice
+#             # MIMIC-3 = = = = = = = = = = = = = = BEGIN
+#             rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
+#             _ssml = (
+#                 '<speak>'
+#                 '<prosody volume=\'64\'>'
+#                 f'<prosody rate=\'{rate}\'>'
+#                 f'<voice name=\'{_voice}\'>'
+#                 '<s>'
+#                 f'{text}'
+#                 '</s>'
+#                 '</voice>'
+#                 '</prosody>'
+#                 '</prosody>'
+#                 '</speak>'
+#             )
+#             with open('_tmp_ssml.txt', 'w') as f:
+#                 f.write(_ssml)
+#             # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
+#             # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer
+#             args = get_args()
+#             args.ssml = True
+#             args.text = [_ssml]  #['aa', 'bb'] #txt
+#             args.interactive = False
+#             # args.output_naming = OutputNaming.TIME
+#             state = CommandLineInterfaceState(args=args)
+#             initialize_args(state)
+#             initialize_tts(state)
+#             # args.texts = [txt] #['aa', 'bb'] #txt
+#             # state.stdout = '.' #None #'makeme.wav'
+#             # state.output_dir = '.noopy'
+#             # state.interactive = False
+#             # state.output_naming = OutputNaming.TIME
+#             # # state.ssml = 1234546575
+#             # state.stdout = True
+#             # state.tts = True
+#             process_lines(state, wav_path='tmp1.wav')
+#             shutdown_tts(state)
+#             x, fs = audiofile.read('tmp1.wav')
+#             total_audio_mimic3.append(x)
+#             print(fs, text, 'mimic3')
+#             # MIMIC3 = = = = = = = = = = = = = = END
+#         total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
+#         audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
+#         total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
+#         audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
+#         print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
+#     else:
+#         print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')
# load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-styletts2 -> run-both-pkl
# FOREIGN + ENGLISH
# For each voice: synthesize the Harvard text with StyleTTS2 and Mimic-3
# (unless both wav files already exist), then run the audinterface models
# on each wav and pickle the resulting frame.
harvard_individual_sentences = None  # read from harvard.json on first use

for folder, list_voices in [
        ['foreign', foreign_voices],
        ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
    _dir = folder + '_pkl/'
    # NOTE(review): only the first 4 voices of each list are processed
    for _voice in list_voices[:4]:
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _str = _str.replace('cmu-arctic', 'cmu_arctic')  # + '.wav'
        print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')
        mimic3_wav = _dir + 'mimic3__' + _str + '.wav'
        styletts2_wav = _dir + 'styletts2__' + _str + '.wav'

        if os.path.isfile(mimic3_wav) and os.path.isfile(styletts2_wav):
            print('Skip:', styletts2_wav)
        else:
            # Mimic3 GitHub Quota exceded:
            #    https://github.com/MycroftAI/mimic3-voices
            #    Above repo can exceed download quota of LFS
            # Copy mimic-voices from local copies
            #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
            #    copy to ~/
            speaker_free_voice_name = _voice.split('#')[0]
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
            generator_onnx = home_voice_dir + 'generator.onnx'
            if (
                (not os.path.isfile(generator_onnx)) or
                (os.path.getsize(generator_onnx) < 500)  # .onnx - is just LFS header
            ):
                # Copy
                shutil.copyfile(
                    f'{ROOT_DIR}voices/{speaker_free_voice_name}/generator.onnx',
                    generator_onnx)

            # ACTUAL TTS
            if harvard_individual_sentences is None:
                with open('harvard.json', 'r') as f:
                    harvard_individual_sentences = json.load(f)['sentences']

            # pre made prompt; the style depends only on the voice, so it
            # is computed once per voice instead of once per text
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
            style_vec = msinference.compute_style(prompt_path)

            total_audio_mimic3 = []
            total_audio_stts2 = []
            mimic3_fs = 22050  # replaced by the rate mimic3 actually produced
            # NOTE(review): only the first list of sentences is used
            for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):  # 77
                text = ' '.join(list_of_10['sentences'])
                # harvard.append(long_sentence.replace('.', ' '))
                # for text in list_of_10['sentences']:
                print(ix, text)
                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)
                total_audio_stts2.append(x)

                # also synthesize mimic with the same sentence and voice
                # MIMIC-3 = = = = = = = = = = = = = = BEGIN
                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                _ssml = (
                    '<speak>'
                    '<prosody volume=\'64\'>'
                    f'<prosody rate=\'{rate}\'>'
                    f'<voice name=\'{_voice}\'>'
                    '<s>'
                    f'{text}'
                    '</s>'
                    '</voice>'
                    '</prosody>'
                    '</prosody>'
                    '</speak>'
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)
                # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
                # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer

                # Named mimic3_args so that the module-level `args`, which
                # carries the torch devices, is not overwritten.
                mimic3_args = get_args()
                mimic3_args.ssml = True
                mimic3_args.text = [_ssml]  # ['aa', 'bb'] #txt
                mimic3_args.interactive = False
                # mimic3_args.output_naming = OutputNaming.TIME
                state = CommandLineInterfaceState(args=mimic3_args)
                initialize_args(state)
                initialize_tts(state)

                # Drop the previous result first: if this synthesis writes
                # nothing, reading must fail instead of silently reusing
                # the audio of an earlier voice.
                try:
                    os.remove('tmp1.wav')
                except FileNotFoundError:
                    pass
                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                mimic3_fs = fs
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')
                # MIMIC3 = = = = = = = = = = = = = = END

            total_audio_stts2 = np.concatenate(total_audio_stts2)  # -- concat 77x lists
            audiofile.write(styletts2_wav, total_audio_stts2, 24000)
            total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat 77x lists
            # written at the rate it was read at (was hard-coded to 22050)
            audiofile.write(mimic3_wav, total_audio_mimic3, mimic3_fs)
            print('Saving:', mimic3_wav)

        # AUD   I N T E R F A C E
        # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if os.path.exists(harvard_of_voice + '.pkl'):
                print(harvard_of_voice + '.pkl', 'FOUND')
            else:
                df_pred = interface.process_file(harvard_of_voice + '.wav')
                df_pred.to_pickle(harvard_of_voice + '.pkl')