"""Plot per-sentence emotion predictions of StyleTTS2 outputs.

PREREQUISITE
    correct_figure.py -> makes the *_analytic.pkl files & CER.

Values are plotted per sentence (no audinterface sliding window).
For each language one PDF is written: ``persentence_<lang>.pdf``.
"""
import os
from pathlib import Path

import audiofile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Column names carried by every *_analytic.pkl frame (kept for reference).
columns = ['prompt-arousal', 'prompt-dominance', 'prompt-valence',
           'prompt-Angry', 'prompt-Sad', 'prompt-Happy', 'prompt-Surprise',
           'prompt-Fear', 'prompt-Disgust', 'prompt-Contempt',
           'prompt-Neutral',
           'styletts2-arousal', 'styletts2-dominance', 'styletts2-valence',
           'styletts2-Angry', 'styletts2-Sad', 'styletts2-Happy',
           'styletts2-Surprise', 'styletts2-Fear', 'styletts2-Disgust',
           'styletts2-Contempt', 'styletts2-Neutral',
           'cer-prompt', 'cer-styletts2']

FULL_PKL = ['english_4x_analytic.pkl',
            'english_analytic.pkl',
            'foreign_4x_analytic.pkl',
            'foreign_analytic.pkl',
            'human_analytic.pkl']
# -------------------------------------------

LABELS = ['arousal', 'dominance', 'valence',
          # 'speech_synthesizer', 'synthetic_singing',
          'Angry', 'Sad', 'Happy', 'Surprise', 'Fear', 'Disgust',
          'Contempt', 'Neutral'
          ]

# https://arxiv.org/pdf/2407.12229
# https://arxiv.org/pdf/2312.05187
# https://arxiv.org/abs/2407.05407
# https://arxiv.org/pdf/2408.06577
# https://arxiv.org/pdf/2309.07405

# Rows of the figure, top to bottom: A/D/V first, then the categories.
DIMENSIONS = ['arousal', 'dominance', 'valence']
# NOTE(review): 'Surprise', 'Contempt' and 'Neutral' are left out; the list is
# reconstructed so that len(DIMENSIONS) + len(CATEGORIES) == 8, the row count
# the figure was originally created with -- confirm against the intended plot.
CATEGORIES = ['Angry', 'Sad', 'Happy', 'Fear', 'Disgust']

X_LABEL = '720 Harvard Sentences'
REFERENCE_LEGEND = 'StyleTTS2 using EmoDB'


def _draw_panel(axis, x_pos, synthetic, reference, legend_labels=None):
    """Draw one subplot: prediction curve over a shaded reference area.

    Args:
        axis: matplotlib Axes to draw on.
        x_pos: 1-D array of x positions (one per sentence).
        synthetic: values drawn as the coloured line.
        reference: values drawn as the grey area filled up from zero.
        legend_labels: optional ``[line_label, area_label]``; when given,
            a legend is added to this subplot.
    """
    line, = axis.plot(x_pos, synthetic,
                      color=(0, 104 / 255, 139 / 255), linewidth=2)
    band = axis.fill_between(x_pos, np.zeros(len(x_pos)), reference,
                             color=(.2, .2, .2), alpha=0.244)
    if legend_labels is not None:
        # Explicit handles keep the label order independent of artist labels.
        axis.legend([line, band], legend_labels, prop={'size': 14})
    axis.set_ylim([1e-7, .9999])
    axis.set_xlim([x_pos[0], x_pos[-1]])
    # Hide the x tick labels without installing a fixed formatter, so the
    # result does not depend on when the x limits are set.
    axis.tick_params(labelbottom=False)
    axis.grid()


preds = {}

for file_interface in FULL_PKL:
    # NOTE: read_pickle executes pickle payloads; only load files produced
    # locally by correct_figure.py, never untrusted ones.
    y = pd.read_pickle(file_interface)
    # y = y.rolling(20).mean()[19:] --> avoid when printing character error rate
    preds[file_interface] = y  # .sort_values('styletts2-valence')
    print(f'\n\n {file_interface}\n_____________________________\n',
          f"{y['cer-prompt'].mean()=}",
          f"{y['cer-styletts2'].mean()=}\n\n")

# =================================== CER ---------------------------

# One x position per sentence; identical for both languages, so built once.
time_stamp = np.arange(len(preds['english_analytic.pkl']))
human = preds['human_analytic.pkl']
row_names = DIMENSIONS + CATEGORIES
last_row = len(row_names) - 1

for lang in ['english', 'foreign']:

    desc = 'English' if lang == 'english' else 'Non-English'

    fig, ax = plt.subplots(nrows=len(row_names), ncols=2,
                           figsize=(24, 20.7),
                           gridspec_kw={'hspace': 0, 'wspace': .04})

    # Left column: Mimic-3 at normal speed; right column: Mimic-3 4x speed.
    column_specs = [
        (preds[f'{lang}_analytic.pkl'],
         f'StyleTTS2 using Mimic-3 {desc}'),
        (preds[f'{lang}_4x_analytic.pkl'],
         f'StyleTTS2 using Mimic-3 {desc} 4x speed'),
    ]

    for row, dim in enumerate(row_names):
        for col, (frame, line_legend) in enumerate(column_specs):
            # Only the top row carries a legend.
            legend_labels = (
                [line_legend, REFERENCE_LEGEND] if row == 0 else None
            )
            _draw_panel(ax[row, col],
                        time_stamp,
                        frame[f'styletts2-{dim}'],
                        human[f'styletts2-{dim}'],
                        legend_labels=legend_labels)

            if row == last_row:
                # x-axis label only on the bottom row: with hspace=0 the rows
                # touch, so a label on an inner row would sit on top of the
                # next panel.
                ax[row, col].set_xlabel(X_LABEL, fontsize=17,
                                        color=(.2, .2, .2))

        # The emotion name is shown once per row, on the left panel.
        ax[row, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)

    plt.savefig(f'persentence_{lang}.pdf', bbox_inches='tight')
    plt.close()