visuals draft

mimic3_make_harvard_sentences.py  (+170 -124)
@@ -21,7 +21,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
     shutdown_tts,
     OutputNaming,
     process_line)
-import msinference
+# import msinference
 import time
 import json
 import pandas as pd
@@ -79,9 +79,9 @@ LABELS = [
 ]
 
 
-
-
-
+config = transformers.Wav2Vec2Config()  #finetuning_task='spef2feat_reg')
+config.dev = torch.device('cuda:0')
+config.dev2 = torch.device('cuda:0')
 def _softmax(x):
     '''x : (batch, num_class)'''
     x -= x.max(1, keepdims=True)  # if all -400 then sum(exp(x)) = 0
@@ -114,7 +114,7 @@ def _infer(self, x):
 teacher_cat = AutoModelForAudioClassification.from_pretrained(
     '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
     trust_remote_code=True  # fun definitions see 3loi/SER-.. repo
-    ).to(
+    ).to(config.dev2).eval()
 teacher_cat.forward = types.MethodType(_infer, teacher_cat)
 
 
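`teacher_cat.forward = types.MethodType(_infer, teacher_cat)` binds the module-level `_infer` to the instance as its `forward`. A minimal illustration of that binding pattern (toy class, not the SER model):

    import types

    class Toy:
        pass

    def double(self, x):        # 'self' becomes the bound instance, as with teacher_cat
        return x * 2

    t = Toy()
    t.forward = types.MethodType(double, t)
    print(t.forward(3))         # -> 6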
@@ -136,7 +136,7 @@ def process_function(x, sampling_rate, idx):
     # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
     #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
     # tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
-    logits_cat = teacher_cat(torch.from_numpy(x).to(
+    logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()
     # USE ALL CATEGORIES
     # --
     # logits_audioset = audioset_model(x, 16000)['logits_sounds']
@@ -162,7 +162,7 @@ interface = audinterface.Feature(
     resample=True,
     verbose=True,
 )
-#
+# ================================== ====== END INTERFACE
 
 
 
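Only the tail of the `audinterface.Feature(...)` call is visible in this hunk; a minimal sketch of how such an interface is typically assembled around `process_function` (the argument values here are assumptions, not taken from the file):

    import audinterface

    interface = audinterface.Feature(
        feature_names=LABELS,            # assumed: one output column per dimension/category
        process_func=process_function,   # defined above in the file
        sampling_rate=16000,             # assumed: the SER models above expect 16 kHz audio
        resample=True,
        verbose=True,
    )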
@@ -272,120 +272,6 @@ Path(foreign_dir).mkdir(parents=True, exist_ok=True)
 
 
 
-# # synth 767
-# for _id, _voice in enumerate(foreign_voices):
-#     _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-#     if 'cmu-arctic' in _str:
-#         _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
-
-#     print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')
-
-#     if (
-#         not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
-#         not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
-#        ):
-
-#         # Mimic3 GitHub Quota exceded:
-#         # https://github.com/MycroftAI/mimic3-voices
-#         # Above repo can exceed download quota of LFS
-#         # Copy mimic-voices from local copies
-#         # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
-#         # copy to ~/
-#         #
-#         #
-#         home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
-#         Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
-#         speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
-
-#         if (
-#             (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
-#             (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
-#            ):
-
-#             # Copy
-#             shutil.copyfile(
-#                 f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
-#                 home_voice_dir + 'generator.onnx')
-
-#         # pre made
-#         prompt_path = 'mimic3_foreign_4x/' + _str + '.wav'
-
-
-#     # =========================================================================== HARVRAD wav
-#     with open('harvard.json', 'r') as f:
-#         harvard_individual_sentences = json.load(f)['sentences']
-#     total_audio_mimic3 = []
-#     total_audio_stts2 = []
-#     ix = 0
-#     for list_of_10 in harvard_individual_sentences[:1]:  # 77
-#         text = ' '.join(list_of_10['sentences'])
-#         # harvard.append(long_sentence.replace('.', ' '))
-#         # for text in list_of_10['sentences']:
-#         style_vec = msinference.compute_style(prompt_path)
-#         print(ix, text)
-#         ix += 1
-
-#         x = msinference.inference(text,
-#                                   style_vec,
-#                                   alpha=0.3,
-#                                   beta=0.7,
-#                                   diffusion_steps=7,
-#                                   embedding_scale=1)
-
-#         total_audio_stts2.append(x)
-
-#         # also synthesize mimic with the same sentence and voice
-
-#         # MIMIC-3 = = = = = = = = = = = = = = BEGIN
-
-#         rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
-#         _ssml = (
-#             '<speak>'
-#             '<prosody volume=\'64\'>'
-#             f'<prosody rate=\'{rate}\'>'
-#             f'<voice name=\'{_voice}\'>'
-#             '<s>'
-#             f'{text}'
-#             '</s>'
-#             '</voice>'
-#             '</prosody>'
-#             '</prosody>'
-#             '</speak>'
-#         )
-#         with open('_tmp_ssml.txt', 'w') as f:
-#             f.write(_ssml)
-
-#         # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
-#         # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer
-#         args = get_args()
-#         args.ssml = True
-#         args.text = [_ssml]  #['aa', 'bb'] #txt
-#         args.interactive = False
-#         # args.output_naming = OutputNaming.TIME
-
-#         state = CommandLineInterfaceState(args=args)
-#         initialize_args(state)
-#         initialize_tts(state)
-#         # args.texts = [txt] #['aa', 'bb'] #txt
-#         # state.stdout = '.' #None #'makeme.wav'
 # # state.output_dir = '.noopy'
 # # state.interactive = False
 # # state.output_naming = OutputNaming.TIME
@@ -609,10 +495,170 @@ for folder, list_voices in [
 
 # AUD I N T E R F A C E
 # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
+
+
+
 for engine in ['mimic3', 'styletts2']:
     harvard_of_voice = f'{_dir}{engine}__{_str}'
     if not os.path.exists(harvard_of_voice + '.pkl'):
-
-
+        df = interface.process_file(harvard_of_voice + '.wav')
+        df.to_pickle(harvard_of_voice + '.pkl')
     else:
-
+        # df = pd.read_pickle(harvard_of_voice + '.pkl')
+        print(harvard_of_voice + '.pkl', 'FOUND')
+
+
+# Here we have pkls
+
+
+# ===============================================================================
+# V I S U A L S
+#
+# ===============================================================================
+
+for folder, list_voices in [
+        ['foreign', foreign_voices],
+        ['english', english_voices],
+        ]:
+    print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
+    for _id, _voice in enumerate(list_voices[:4]):
+        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+        _dir = folder + '_pkl/'
+        if 'cmu-arctic' in _str:
+            _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
+
+        vis_df = {}
+        # LOAD PKL
+        for engine in ['mimic3', 'styletts2']:
+            harvard_of_voice = f'{_dir}{engine}__{_str}'
+            if not os.path.exists(harvard_of_voice + '.pkl'):
+                df = interface.process_file(harvard_of_voice + '.wav')
+                df.to_pickle(harvard_of_voice + '.pkl')
+            else:
+                df = pd.read_pickle(harvard_of_voice + '.pkl')
+                print(harvard_of_voice + '.pkl', 'FOUND')
+
+            vis_df[engine] = df
+        SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
+        for k, v in vis_df.items():
+            p = v[:SHORT]  # Truncate extra segments - human is slower than mimic3
+
+            p.reset_index(inplace=True)
+            p.drop(columns=['file', 'start'], inplace=True)
+            p.set_index('end', inplace=True)
+            # p = p.filter(scene_classes)  #['transport', 'indoor', 'outdoor'])
+            p.index = p.index.map(mapper=(lambda x: x.total_seconds()))
+            vis_df[k] = p
+
+        print(vis_df, '\n\n\n\n \n')
+        # ============ VISUAL ADV cats of styletts2 vs mimic3 same-voice
+
+        fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
+                               gridspec_kw={'hspace': 0, 'wspace': .04})
+
+        # ADV
+
+        time_stamp = vis_df['mimic3'].index.to_numpy()
+        for j, dim in enumerate(['arousal',
+                                 'dominance',
+                                 'valence']):
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            if j == 0:
+                ax[j, 0].legend(['StyleTTS2 style mimic3',
+                                 'StyleTTS2 style crema-d'],
+                                prop={'size': 10},
+                                # loc='lower right'
+                                )
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICK
+            ax[j, 0].set_ylim([1e-7, .9999])
+            # ax[j, 0].set_yticks([.25, .5, .75])
+            # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+
+            ax[j, 0].grid()
+
+        # CATEGORIES
+
+        time_stamp = vis_df['styletts2'].index.to_numpy()
+        for j, dim in enumerate(['Angry',
+                                 'Sad',
+                                 'Happy',
+                                 'Surprise',
+                                 'Fear',
+                                 'Disgust',
+                                 'Contempt',
+                                 # 'Neutral'
+                                 ]):  # ASaHSuFDCN
+            j = j + 3  # skip A/D/V subplots
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            # ax[j, 0].legend(['StyleTTS2 style mimic3',
+            #                  'StyleTTS2 style crema-d'],
+            #                 prop={'size': 10},
+            #                 # loc='upper left'
+            #                 )
+
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICKS
+            ax[j, 0].set_ylim([1e-7, .9999])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4, .4, .4))
+
+            ax[j, 0].grid()
+
+        plt.savefig(f'bh_{_str}.png', bbox_inches='tight')
+        plt.close()
+
+print('UNCOMMENT msinference')
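The load-or-compute pattern and the index post-processing in this hunk could be folded into one helper; a sketch under the assumption that `interface.process_file` returns a DataFrame with a (file, start, end) Timedelta MultiIndex, which is what the code above drops and converts:

    import os
    import pandas as pd

    def load_timeseries(wav_path, pkl_path):
        '''Compute the audinterface time-series once, then reuse the pickle.'''
        if not os.path.exists(pkl_path):
            df = interface.process_file(wav_path)
            df.to_pickle(pkl_path)
        else:
            df = pd.read_pickle(pkl_path)
        # (file, start, end) MultiIndex -> plain float seconds on 'end'
        p = df.reset_index().drop(columns=['file', 'start']).set_index('end')
        p.index = p.index.map(lambda x: x.total_seconds())
        return p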