dkounadis
/

artificial-styletts2

@@ -299,7 +299,7 @@ for audio_prompt in ['english',
                      'foreign',
                      'foreign_4x']:   # each of these creates a separate pkl - so outer for
     #
-    data = np.zeros((767, len(LABELS)*2 + 2))  # 720 x LABELS-prompt & LABELS-stts2 & cer-prompt & cer-stts2

                      'foreign',
                      'foreign_4x']:   # each of these creates a separate pkl - so outer for
     #
+    data = np.zeros((770, len(LABELS)*2 + 2))  # 768 x LABELS-prompt & LABELS-stts2 & cer-prompt & cer-stts2

visualize_per_sentence.py ADDED Viewed

	@@ -0,0 +1,244 @@

+# PREREQUISITE
+# correct_figure.py -> makes analytic.pkl & CER -> per sentence No Audinterface sliding window
+import pandas as pd
+import os
+import numpy as np
+from pathlib import Path
+import matplotlib.pyplot as plt
+import audiofile
+columns = ['prompt-arousal',
+           'prompt-dominance',
+           'prompt-valence',
+           'prompt-Angry',
+           'prompt-Sad',
+           'prompt-Happy',
+           'prompt-Surprise',
+           'prompt-Fear',
+           'prompt-Disgust',
+           'prompt-Contempt',
+           'prompt-Neutral',
+           'styletts2-arousal',
+           'styletts2-dominance',
+           'styletts2-valence',
+           'styletts2-Angry',
+           'styletts2-Sad',
+           'styletts2-Happy',
+           'styletts2-Surprise',
+           'styletts2-Fear',
+           'styletts2-Disgust',
+           'styletts2-Contempt',
+           'styletts2-Neutral',
+           'cer-prompt',
+           'cer-styletts2']
+FULL_PKL = ['english_4x_analytic.pkl',
+         'english_analytic.pkl',
+         'foreign_4x_analytic.pkl',
+         'foreign_analytic.pkl',
+         'human_analytic.pkl']
+# -------------------------------------------
+LABELS = ['arousal', 'dominance', 'valence',
+        #    'speech_synthesizer', 'synthetic_singing',
+           'Angry',
+           'Sad',
+           'Happy',
+           'Surprise',
+            'Fear',
+            'Disgust',
+            'Contempt',
+            'Neutral'
+            ]
+    # https://arxiv.org/pdf/2407.12229
+    #  https://arxiv.org/pdf/2312.05187
+    # https://arxiv.org/abs/2407.05407
+    # https://arxiv.org/pdf/2408.06577
+    # https://arxiv.org/pdf/2309.07405
+preds  = {}
+for file_interface in FULL_PKL:
+    y = pd.read_pickle(file_interface)
+    preds[file_interface] = y
+for lang in ['english',
+             'foreign']:
+            fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24,20.7),
+                                   gridspec_kw={'hspace': 0, 'wspace': .04})
+            time_stamp = np.arange(len(preds['english_analytic.pkl']))
+            _z = np.zeros(len(preds['english_analytic.pkl']))
+            for j, dim in enumerate(['arousal', 'dominance', 'valence']):
+                # MIMIC3
+                ax[j, 0].plot(time_stamp, preds[f'{lang}_analytic.pkl'][f'styletts2-{dim}'],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 0].fill_between(time_stamp,
+                                _z,
+                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                if j == 0:
+                    if lang == 'english':
+                        desc = 'English'
+                    else:
+                        desc = 'Non-English'
+                    ax[j, 0].legend([f'StyleTTS2 using Mimic-3 {desc}',
+                                     f'StyleTTS2 using EmoDB'],
+                                    prop={'size': 14},
+                                    )
+                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
+                # TICK
+                ax[j, 0].set_ylim([1e-7, .9999])
+                # ax[j, 0].set_yticks([.25, .5,.75])
+                # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
+                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+            # MIMIC3   4x speed
+                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_analytic.pkl'][f'styletts2-{dim}'],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 1].fill_between(time_stamp,
+                                _z,
+                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                if j == 0:
+                    if lang == 'english':
+                        desc = 'English'
+                    else:
+                        desc = 'Non-English'
+                    ax[j, 1].legend([f'StyleTTS2 using Mimic-3 {desc} 4x speed',
+                                    f'StyleTTS2 using EmoDB'],
+                                    prop={'size': 14},
+                                    #  loc='lower right'
+                                    )
+                ax[j, 1].set_xlabel('720 Harvard Sentences')
+                # TICK
+                ax[j, 1].set_ylim([1e-7, .9999])
+                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
+                ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].grid()
+                ax[j, 1].grid()
+            # CATEGORIES
+            for j, dim in enumerate(['Angry',
+                                    'Sad',
+                                    'Happy',
+                                    #  'Surprise',
+                                    'Fear',
+                                    'Disgust',
+                                    #  'Contempt',
+                                    #  'Neutral'
+                                    ]):   # ASaHSuFDCN
+                j = j + 3  # skip A/D/V subplots
+                # MIMIC3
+                ax[j, 0].plot(time_stamp, preds[f'{lang}_analytic.pkl'][f'styletts2-{dim}'],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 0].fill_between(time_stamp,
+                                _z,
+                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                # ax[j, 0].legend(['StyleTTS2 style mimic3',
+                #                  'StyleTTS2 style crema-d'],
+                #                  prop={'size': 10},
+                #                 #  loc='upper left'
+                # )
+                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
+                # TICKS
+                ax[j, 0].set_ylim([1e-7, .9999])
+                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 0].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
+            # MIMIC3   4x speed
+                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_analytic.pkl'][f'styletts2-{dim}'],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 1].fill_between(time_stamp,
+                                _z,
+                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                # ax[j, 1].legend(['StyleTTS2 style mimic3   4x speed',
+                #                  'StyleTTS2 style crema-d'],
+                #                  prop={'size': 10},
+                #                 #  loc='upper left'
+                # )
+                ax[j, 1].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
+                ax[j, 1].set_ylim([1e-7, .9999])
+                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
+                ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
+                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].grid()
+                ax[j, 1].grid()
+            plt.savefig(f'persentence_{lang}.pdf', bbox_inches='tight')
+            plt.close()