# File size: 22,201 Bytes

# https://github.com/audeering/shift/tree/main  -- RUN FROM THIS REPO
import shutil
import csv
import io
import os
import typing
import wave
import sys
import audresample
from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                 get_args,
                                 initialize_args,
                                 initialize_tts,
                                 # print_voices,
                                 # process_lines,
                                 shutdown_tts,
                                 OutputNaming,
                                 process_line)
import msinference
import time
import json
import pandas as pd
import os
import numpy as np
import audonnx
import audb
from pathlib import Path
import transformers
import torch
import audmodel
import audinterface
import matplotlib.pyplot as plt
import audiofile


# ================================================ LIST OF VOICES
# ROOT_DIR = '/data/dkounadis/mimic3-voices/'
# foreign_voices = []
# english_voices = []
# for lang in os.listdir(ROOT_DIR + 'voices'):
        
#         for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
#             if 'en_' in lang:

#                 try:
#                     with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
#                         for spk in f:
#                             english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
#                         # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
#                 except FileNotFoundError:
#                     english_voices.append(lang + '/' + voice)

#             else:
                
#                 try:
#                     with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
#                         for spk in f:
#                             foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
                        
#                 except FileNotFoundError:
#                     foreign_voices.append(lang + '/' + voice)
# # 
# [print(i) for i in foreign_voices]
# print('\n_______________________________\n')
# [print(i) for i in english_voices]
# ====================================================== LIST Mimic-3 ALL VOICES
# Voices processed by this script, in order. Entries other than 'human' are
# used verbatim as the SSML <voice name=...> for Mimic-3. 'human' is special:
# a fixed recording is loaded as the style file and Mimic-3 is not run.
list_voices = [
    'en_US/m-ailabs_low#mary_ann', 'en_UK/apope_low',
    # is the 4x really interesting? we can just write it in Section
    'de_DE/thorsten-emotion_low#neutral',
    'human',
]


# ================================================== INTERFACE MODELS
# Column names of the feature frame produced by the interface below:
# three dimensional attributes followed by eight emotion categories.
# ('speech_synthesizer' / 'synthetic_singing' are currently not used.)
LABELS = [
    'arousal', 'dominance', 'valence',
    'Angry', 'Sad', 'Happy', 'Surprise',
    'Fear', 'Disgust', 'Contempt', 'Neutral',
]


# A Wav2Vec2Config instance is used only as a container for the device
# settings read further down (`dev` by process_function, `dev2` when the
# categorical teacher is loaded). Both currently point to the same GPU.
config = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg'
config.dev = config.dev2 = torch.device('cuda:0')
def _softmax(x):
    '''Row-wise softmax that does not modify its argument.

    Args:
        x: array of shape (batch, num_class) holding logits.

    Returns:
        New array of the same shape; every row is non-negative and sums
        to 1.

    The previous version shifted ``x`` in place (``x -= ...``), silently
    altering the caller's logits. All steps now produce new arrays.
    '''
    x = np.asarray(x)
    # Subtract the row maximum for numerical stability (largest entry -> 0).
    shifted = x - x.max(1, keepdims=True)
    # Floor very negative values so that exp() stays well-behaved.
    shifted = np.maximum(-100, shifted)
    exp = np.exp(shifted)
    return exp / exp.sum(1, keepdims=True)


from transformers import AutoModelForAudioClassification
import types


def _infer(self, x):
    '''Forward pass bound onto the categorical emotion model.

    x: (batch, audio-samples-16KHz)

    Normalises the waveform with ``config.mean`` / ``config.std`` (the mean
    is added, not subtracted), runs ``ssl_model``, pools its last hidden
    state over axis 1 with attention weights into a weighted mean and a
    weighted standard deviation, and feeds their concatenation to
    ``ser_model``.
    '''
    cfg = self.config
    normalized = (x + cfg.mean) / cfg.std
    hidden = self.ssl_model(normalized, attention_mask=None).last_hidden_state

    # Attention weights over axis 1 of the hidden states.
    pool = self.pool_model
    scores = torch.matmul(torch.tanh(pool.sap_linear(hidden)), pool.attention)
    weights = torch.softmax(scores, dim=1)

    # Weighted first and second moments -> mean and standard deviation.
    mean = (hidden * weights).sum(dim=1)
    variance = (hidden * hidden * weights).sum(dim=1) - mean * mean
    std = torch.sqrt(torch.clamp(variance, min=1e-7))  # clamp keeps sqrt finite

    return self.ser_model(torch.cat([mean, std], dim=1))

# Categorical emotion teacher. `trust_remote_code=True` is needed because
# the model classes are defined in the 3loi/SER-.. hub repository.
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True,
)
teacher_cat = teacher_cat.to(config.dev2)
teacher_cat = teacher_cat.eval()
# Replace the model's forward() with the local _infer defined in this file.
teacher_cat.forward = types.MethodType(_infer, teacher_cat)



# Audioset & ADV

# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
# ONNX model fetched via audmodel and loaded on the first GPU. Its 'logits'
# output is used by process_function as the leading columns of the feature
# vector (presumably arousal / dominance / valence, matching LABELS - confirm
# against the model card).
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')

def process_function(x, sampling_rate, idx):
    '''Run both teacher models on one audio window.

    Args:
        x: numpy array with the audio of the current window. It is handed
            unchanged to ``torch.from_numpy`` and to ``adv_model``
            (assumed float32, (channels, samples), 16 kHz - TODO confirm).
        sampling_rate: not used; 16000 is passed to ``adv_model``.
        idx: not used.

    Returns:
        ``np.concatenate`` along axis 1 of

        * the 'logits' output of ``adv_model``, followed by
        * the softmax of the categorical logits of ``teacher_cat``
          (index order per the model card: 0 Angry, 1 Sad, 2 Happy,
          3 Surprise, 4 Fear, 5 Disgust, 6 Contempt, 7 Neutral),

        which is expected to give the 3 + 8 = 11 columns named in LABELS.
        The audioset outputs (speech synthesizer / synthetic singing) are
        currently disabled and NOT part of the result.
    '''
    # Inference only: no_grad avoids building an autograd graph for every
    # window (the original detached the output afterwards).
    with torch.no_grad():
        logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().numpy()

    logits_adv = adv_model(x, 16000)['logits']

    cat = np.concatenate([logits_adv, _softmax(logits_cat)], 1)
    print(cat)
    return cat

# Feature interface that wraps process_function: each processed file yields
# a frame with one column per entry of LABELS.
interface = audinterface.Feature(
    feature_names=LABELS,
    process_func=process_function,
    # process_func_args={'outputs': 'logits_scene'},
    # False: process_function does not window the signal itself; the
    # windowing is left to audinterface (see its documentation).
    process_func_applies_sliding_window=False,
    win_dur=7.0,  # window length (audinterface default unit: seconds)
    hop_dur=4.0,  # hop between windows, i.e. consecutive windows overlap
    sampling_rate=16000,  # both models are called with 16 kHz audio
    resample=True,
    verbose=True,
)
# ==================================    ====== END INTERFACE





def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesise every text in ``state.texts`` and save the audio as WAV.

    Adapted from the Mimic-3 command line tool: instead of playing the
    result or writing it to stdout, the combined audio is written to
    ``wav_path``.

    Args:
        state: initialised Mimic-3 CLI state (``initialize_args`` and
            ``initialize_tts`` must have been called).
        wav_path: destination of the WAV file.

    Raises:
        ValueError: audio was produced but ``wav_path`` is ``None``.

    Changes compared to the previous version:
        * The file is written whenever audio is available. Before, it was
          written only if stdout was a TTY and ``--stdout`` was unset, so
          with redirected output no file was produced and the caller read
          a missing or stale WAV.
        * A missing ``wav_path`` gives an explicit error instead of the
          ``TypeError`` raised by ``open(None)``.
        * Removed the unused line counter and the no-op ``seek(0)``.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()
        # Empty lines are NOT skipped (the skip is disabled on purpose).

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # Without this pause state.all_audio can still be empty at this point
    # (see the message printed below).
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    if wav_path is None:
        raise ValueError('wav_path is required to save the synthesised audio')

    # Wrap the raw samples in a WAV container; the header is finalised when
    # the wave writer is closed, so read the bytes only afterwards.
    with io.BytesIO() as wav_io:
        with wave.open(wav_io, "wb") as wav_file_play:
            wav_file_play.setframerate(state.sample_rate_hz)
            wav_file_play.setsampwidth(state.sample_width_bytes)
            wav_file_play.setnchannels(state.num_channels)
            wav_file_play.writeframes(state.all_audio)
        wav_bytes = wav_io.getvalue()

    with open(wav_path, 'wb') as wav_file:
        wav_file.write(wav_bytes)
    print('\n\n5T', wav_path)

# -----------------------------------------------------------------------------
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
# ======================================================================





#                      END DEF



# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign

# STYLES Already Made - HF
# Folder receiving the generated .wav files and the .pkl prediction frames;
# created on first run (including missing parents).
out_dir = 'out_dir/'
os.makedirs(out_dir, exist_ok=True)



# For every voice:
#   1. synthesise the Harvard sentences with Mimic-3 (or load a human
#      recording for the special 'human' entry),
#   2. use that audio as style prompt for StyleTTS2 and synthesise the same
#      text,
#   3. store both results as 16 kHz wav files in out_dir,
#   4. run the emotion interface on both files and pickle the predictions.
# Steps 1-3 are skipped when both wav files exist; step 4 is skipped per
# file when its .pkl exists.
for _voice in list_voices:
    # File-name friendly tag, e.g. 'en_US/m-ailabs_low#mary_ann' ->
    # 'en_US_m-ailabs_mary_ann'.
    _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
    _str = _str.replace('cmu-arctic', 'cmu_arctic')

    print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')

    if (
        not os.path.isfile(out_dir + 'mimic3__' + _str + '.wav') or
        not os.path.isfile(out_dir + 'styletts2__' + _str + '.wav')
    ):

        # Mimic3 GitHub quota exceeded:
        #    https://github.com/MycroftAI/mimic3-voices
        #    The repo above can exceed the LFS download quota.
        # Copy the voices from a local clone instead:
        #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
        #    copy to ~/
        if 'human' not in _voice:
            # Make sure the Mimic-3 generator .onnx of this voice exists.
            speaker_free_voice_name = _voice.split("#")[0]
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
            generator_onnx = home_voice_dir + 'generator.onnx'

            if (
                (not os.path.isfile(generator_onnx)) or
                (os.path.getsize(generator_onnx) < 500)  # .onnx is just the LFS header
            ):
                shutil.copyfile(
                    f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                    generator_onnx)

        with open('harvard.json', 'r') as f:
            harvard_individual_sentences = json.load(f)['sentences']
        total_audio_mimic3 = []
        total_audio_styletts2 = []
        for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):  # 77

            text = ' '.join(list_of_10['sentences'])

            print(ix, text)

            # Synthesise with Mimic-3, then use the result as prompt for
            # StyleTTS2 (unless the voice is 'human').
            if 'human' not in _voice:
                rate = 1
                _ssml = (
                    "<speak>"
                    "<prosody volume='64'>"
                    f"<prosody rate='{rate}'>"
                    f"<voice name='{_voice}'>"
                    "<s>"
                    f"{text}"
                    "</s>"
                    "</voice>"
                    "</prosody>"
                    "</prosody>"
                    "</speak>"
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                # Mimic-3 is driven through its CLI state objects instead of
                # a `cat _tmp_ssml.txt | mimic3 --ssml` subprocess.
                args = get_args()
                args.ssml = True
                args.text = [_ssml]
                args.interactive = False

                state = CommandLineInterfaceState(args=args)
                initialize_args(state)
                initialize_tts(state)
                style_path = 'tmp1.wav'
                process_lines(state, wav_path=style_path)
                shutdown_tts(state)
                x, fs = audiofile.read(style_path)
            else:
                # Alternative file:
                #   MSP['valence.train.votes'].get().sort_values('7').index[-1]
                #   '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
                # Used file:
                #   MSP['emotion.test-1'].get().sort_values('valence').index[-1]
                style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
                # Should not be very short - about as long as the Harvard sentences.
                x, fs = audiofile.read(style_path)
                print(x.shape,' human')   # NOTE: the human audio is not cropped here
            total_audio_mimic3.append(x)
            print(f'{len(total_audio_mimic3)=}')
            print(fs, text, 'mimic3')

            # StyleTTS2, prompted with the Mimic-3 / human audio from above.
            style_vec = msinference.compute_style(style_path)

            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)

            total_audio_styletts2.append(x)

        # Save the StyleTTS2 .wav. The output of msinference is treated as
        # 24 kHz audio, as before (assumption - confirm in msinference).
        total_audio_styletts2 = np.concatenate(total_audio_styletts2)
        total_audio_styletts2 = audresample.resample(total_audio_styletts2,
                                                     original_rate=24000,
                                                     target_rate=16000)[0]
        print('RESAMPLEstyletts2', total_audio_styletts2.shape)
        audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_styletts2, 16000)

        # Save the Mimic-3 or human .wav.
        # FIX: resample from the rate reported by audiofile.read (`fs`)
        # instead of a hard-coded 24000 - the synthesised file is written
        # with state.sample_rate_hz, which need not be 24 kHz. Equal rates
        # are skipped so the `[0]` indexing is only applied to resampled
        # output.
        total_audio_mimic3 = np.concatenate(total_audio_mimic3)
        if fs != 16000:
            total_audio_mimic3 = audresample.resample(total_audio_mimic3,
                                                      original_rate=fs,
                                                      target_rate=16000)[0]
        elif 'human' in _str:
            print('human is already on 16kHz - MSPpodcst file')
        print('RESAMPLEmimic3', total_audio_mimic3.shape)
        audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)

        print(total_audio_mimic3.shape, total_audio_styletts2.shape, 'LEN OF TOTAL\n')

    # Emotion predictions (audinterface) for both engines of this voice.
    for engine in ['mimic3',
                   'styletts2']:
        harvard_of_voice = f'{out_dir}{engine}__{_str}'
        if not os.path.exists(harvard_of_voice + '.pkl'):
            df = interface.process_file(harvard_of_voice + '.wav')
            df.to_pickle(harvard_of_voice + '.pkl')
            print('\n\n', harvard_of_voice, df,'\n___________________________\n')






             

        
        
# NOTE: the script stops here unconditionally. The print below and the whole
# "V I S U A L S" part are therefore never executed while this line is in
# place; remove it to produce the figures.
raise SystemExit
print('\nVisuals\n')

# ===============================================================================
# V I S U A L S
#
# ===============================================================================
# Voices are plotted two per figure: (0, 1) in the first, (2, 3) in the
# second; the first voice of a pair fills the left column, the second the
# right column.
voice_pairs = [
    [list_voices[left], list_voices[right]]
    for left, right in ((0, 1), (2, 3))
]

# Plot 1: list_voices[0] vs list_voices[1]
# Plot 2: list_voices[2] vs list_voices[3]

# One figure per voice pair: 10 rows (arousal, dominance, valence and seven
# emotion categories) x 2 columns (left: first voice, right: second voice).
# Each panel shows the prediction for the Mimic-3 / human audio as a line
# and shades the gap to the prediction for the StyleTTS2 audio.
#
# Fixes over the previous version:
#   * left-column category panels shaded the data of the SECOND voice,
#   * left-column A/D/V panels drew the StyleTTS2 curve although the legend
#     labels the line as mimic3 (and all other panels draw mimic3),
#   * the time axis of one voice was used for both columns,
#   * tick labels of column 1 were derived from column 0, and the y-limit
#     differed (.999 vs .9999) between panels,
#   * in-place edits on a DataFrame slice (SettingWithCopy warning).
for vox1, vox2 in voice_pairs:  # one figure per pair

    _str1 = (vox1.replace('/', '_').replace('#', '_').replace('_low', '')
             .replace('cmu-arctic', 'cmu_arctic'))
    _str2 = (vox2.replace('/', '_').replace('#', '_').replace('_low', '')
             .replace('cmu-arctic', 'cmu_arctic'))

    # Prediction frames written by the processing part of this script
    # (pickles: only load files you created yourself).
    vis_df = {
        f'{engine}_{tag}': pd.read_pickle(f'{out_dir}{engine}__{tag}.pkl')
        for engine in ('mimic3', 'styletts2')
        for tag in (_str1, _str2)
    }

    # The TTS durations differ per voice (human is slower than mimic3):
    # truncate all frames to the shortest one.
    SHORT_LEN = min(len(v) for v in vis_df.values())
    preds = {}
    for k, v in vis_df.items():
        p = v.iloc[:SHORT_LEN]
        print('\n\n\n\n',k, p)
        # Keep only the segment end as index, expressed in seconds.
        p = p.reset_index().drop(columns=['file', 'start']).set_index('end')
        p.index = p.index.map(lambda x: x.total_seconds())
        preds[k] = p

    fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
                           gridspec_kw={'hspace': 0, 'wspace': .04})

    adv_dims = ['arousal', 'dominance', 'valence']
    cat_dims = ['Angry', 'Sad', 'Happy', 'Surprise',
                'Fear', 'Disgust', 'Contempt']  # 'Neutral' is not plotted

    for col, tag in enumerate((_str1, _str2)):
        reference = preds[f'mimic3_{tag}']
        cloned = preds[f'styletts2_{tag}']
        time_stamp = reference.index.to_numpy()

        for row, dim in enumerate(adv_dims + cat_dims):
            axis = ax[row, col]

            axis.plot(time_stamp, reference[dim].to_numpy(),
                      color=(0, 104/255, 139/255),
                      label='mean_1',
                      linewidth=2)
            axis.fill_between(time_stamp,
                              reference[dim].to_numpy(),
                              cloned[dim].to_numpy(),
                              color=(.2, .2, .2),
                              alpha=0.244)

            if row == 0:
                # Labels are matched to the artists in drawing order:
                # line first, shaded area second.
                axis.legend([f'mimic3_{tag}',
                             f'StyleTTS2 using {tag}'],
                            prop={'size': 10})

            if col == 0:
                axis.set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)

            # x-labels as before: styled on the category rows, plain on the
            # right column of the A/D/V rows, none on their left column.
            if row >= len(adv_dims):
                axis.set_xlabel('767 Harvard Sentences (seconds)',
                                fontsize=16, color=(.4, .4, .4))
            elif col == 1:
                axis.set_xlabel('767 Harvard Sentences (seconds)')

            # TICKS
            axis.set_ylim([1e-7, .9999])
            axis.set_xlim([time_stamp[0], time_stamp[-1]])
            axis.set_xticklabels(['' for _ in axis.get_xticklabels()])
            axis.grid()

    plt.savefig(f'pair_{_str1}_{_str2}.png', bbox_inches='tight')
    plt.close()