# 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
# 1. Synthesize                  via StyleTTS2 --> use same or sweetdreams
# 2. Run audinterface on this 767 
# 3.      .mimic3_pkl   .styletts2_pkl     -> different durations

# The script may crash with an "onnx protobuf incomplete file" error because shutil.copyfile() is not truly blocking.
# If that happens, rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
import shutil
import csv
import io
import os
import typing
import wave
import sys
from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                 get_args,
                                 initialize_args,
                                 initialize_tts,
                                 # print_voices,
                                 # process_lines,
                                 shutdown_tts,
                                 OutputNaming,
                                 process_line)
import msinference
import time
import json
import pandas as pd
import os
import numpy as np
import audonnx
import audb
from pathlib import Path
import transformers
import torch
import audmodel
import audinterface
import matplotlib.pyplot as plt
import audiofile


# ================================================ LIST OF VOICES
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
foreign_voices = []
english_voices = []
# Walk voices/<lang>/<voice>/ and register every voice as '<lang>/<voice>',
# or as '<lang>/<voice>#<speaker>' for each line of its speakers.txt.
for lang in os.listdir(ROOT_DIR + 'voices'):
    # Language folders whose name contains 'en_' count as English.
    bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        voice_key = lang + '/' + voice
        speakers_file = ROOT_DIR + 'voices/' + voice_key + '/speakers.txt'
        try:
            with open(speakers_file, 'r') as f:
                # Multi-speaker voice: one entry per listed speaker.
                bucket.extend(voice_key + '#' + spk.rstrip() for spk in f)
        except FileNotFoundError:
            # No speakers.txt: register the voice itself.
            bucket.append(voice_key)
# ================================================== INTERFACE MODELS
# Column names of the interface output, in the order in which
# process_function concatenates its scores: the ADV logits first,
# then the categorical emotion probabilities.
LABELS = (
    ['arousal', 'dominance', 'valence']
    # 'speech_synthesizer', 'synthetic_singing'  (Audioset outputs, disabled)
    + ['Angry', 'Sad', 'Happy', 'Surprise', 'Fear', 'Disgust', 'Contempt', 'Neutral']
)


# A Wav2Vec2Config instance doubles as a plain attribute container for the
# torch devices used by the teacher models in this file.
args = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg'
args.dev = args.dev2 = torch.device('cuda:0')
def _softmax(x):
    '''Row-wise, numerically stable softmax.

    Args:
        x: array-like of shape (batch, num_class).

    Returns:
        New array of the same shape whose rows sum to 1.
        The input is left untouched.
    '''
    x = np.asarray(x)
    # Shift so that every row has max 0. Done out of place so the caller's
    # array is not modified (an in-place `-=` would overwrite its logits).
    x = x - x.max(1, keepdims=True)
    # Floor very negative values before exponentiating.
    x = np.maximum(-100, x)
    x = np.exp(x)
    return x / x.sum(1, keepdims=True)


from transformers import AutoModelForAudioClassification
import types


def _infer(self, x):
    '''Forward pass used in place of the model's own ``forward``.

    x: (batch, audio-samples-16KHz)

    Normalises the input, runs the SSL encoder, pools the hidden states
    into attention-weighted mean and standard deviation, and feeds their
    concatenation to ``self.ser_model``.
    '''
    # Note the sign: the configured mean is added, not subtracted.
    normed = (x + self.config.mean) / self.config.std
    hidden = self.ssl_model(normed, attention_mask=None).last_hidden_state

    # Attention weights, normalised over dim 1.
    scores = self.pool_model.sap_linear(hidden).tanh()
    weights = torch.matmul(scores, self.pool_model.attention).softmax(1)

    # Weighted first moment and weighted standard deviation.
    mu = (hidden * weights).sum(1)
    second_moment = (hidden * hidden * weights).sum(1)
    sigma = (second_moment - mu * mu).clamp(min=1e-7).sqrt()

    return self.ser_model(torch.cat([mu, sigma], 1))

# Categorical emotion teacher. The checkpoint ships its own model code
# (see the 3loi/SER-.. repo), hence trust_remote_code=True.
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True,
)
teacher_cat = teacher_cat.to(args.dev2).eval()
# Calls to the model now go through _infer.
teacher_cat.forward = types.MethodType(_infer, teacher_cat)


# Audioset & ADV teachers (ONNX models fetched via audmodel, run via audonnx on cuda:0).

# The Audioset model is currently disabled; only the ADV model is used by process_function.
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
# NOTE(review): '90398682-2.0.0' is presumably the arousal/dominance/valence model,
# since its 'logits' output fills the first three LABELS columns - confirm.
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')

def process_function(x, sampling_rate, idx):
    '''Score one audio window with both teacher models.

    Args:
        x: numpy signal as handed over by the interface
            (assumed (channels, samples) at 16 kHz - TODO confirm).
        sampling_rate: unused; both teachers are fed 16 kHz audio.
        idx: unused; accepted because the interface can pass it.

    Returns:
        Array with the ADV logits followed by the softmax over the
        categorical emotion logits, concatenated along axis 1
        (column order as in LABELS).
    '''
    # Categorical teacher output order:
    # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
    #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
    # Take the device from the model itself rather than from the mutable
    # module-level `args`, so the input always lands where the model lives.
    device = next(teacher_cat.parameters()).device
    # Pure inference: disable autograd so no graph is built per window.
    with torch.no_grad():
        logits_cat = teacher_cat(torch.from_numpy(x).to(device)).cpu().numpy()

    # Audioset teacher (speech synthesizer / synthetic singing) is disabled:
    # logits_audioset = audioset_model(x, 16000)['logits_sounds'][:, [7, 35]]
    logits_adv = adv_model(x, 16000)['logits']

    cat = np.concatenate([logits_adv, _softmax(logits_cat)], 1)
    print(cat)
    return cat

# Feature interface around process_function: input is resampled to 16 kHz,
# audinterface applies the sliding window itself (win_dur=4.0, hop_dur=1.0)
# and labels the output columns with LABELS.
interface = audinterface.Feature(
    feature_names=LABELS,
    # --- processing callback
    process_func=process_function,
    process_func_applies_sliding_window=False,
    # --- input audio
    sampling_rate=16000,
    resample=True,
    # --- windowing
    win_dur=4.0,
    hop_dur=1.0,
    # --- progress reporting
    verbose=True,
)
# ======================================== END INTERFACE


# Drop the foreign voices excluded for insufficient (prompt) duration.
foreign_voices = [
    voice
    for voice in foreign_voices
    if voice not in {
        'bn/multi_low#02194',
        'uk_UK/m-ailabs_low#obruchov',
        'uk_UK/m-ailabs_low#shepel',
        'uk_UK/m-ailabs_low#loboda',
        'uk_UK/m-ailabs_low#miskun',
        'uk_UK/m-ailabs_low#sumska',
        'uk_UK/m-ailabs_low#pysariev',
    }
]

# print(english_voices, '\n_________________________\n', foreign_voices)
# ----------------------
# print(foreign_voices.keys(), len(foreign_voices))
# raise SystemExit


def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesize every entry of ``state.texts`` and save the combined audio.

    Adapted from the mimic3 command-line code: each line is handed to
    ``process_line``; afterwards the audio accumulated in ``state.all_audio``
    is written as one WAV file.

    Args:
        state: initialised mimic3 command-line state.
        wav_path: destination WAV file. If None, nothing is written and the
            audio stays available in ``state.all_audio``.

    Returns:
        None.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        # Empty lines are deliberately not skipped here.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # NOTE(review): presumably gives mimic3 time to deliver the audio into
    # state.all_audio before it is read below - confirm.
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    if wav_path is None:
        # No destination given: avoid open(None) and leave the audio in state.
        return

    # Always write the file when a path was requested. Making this depend on
    # stdout being a TTY would silently skip the write for redirected runs
    # and leave a stale file for the caller to read.
    with open(wav_path, 'wb') as wav_file:
        with wave.open(wav_file, 'wb') as wav_writer:
            wav_writer.setframerate(state.sample_rate_hz)
            wav_writer.setsampwidth(state.sample_width_bytes)
            wav_writer.setnchannels(state.num_channels)
            wav_writer.writeframes(state.all_audio)
    print('\n\n5T', wav_path)

# -----------------------------------------------------------------------------
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
# ======================================================================


#                      END DEF


# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign

# STYLES Already Made - HF
english_dir, foreign_dir = 'english_pkl/', 'foreign_pkl/'

# Make sure both output folders exist before anything is written to them.
for _pkl_dir in (english_dir, foreign_dir):
    Path(_pkl_dir).mkdir(parents=True, exist_ok=True)


# # synth 767
# for _id, _voice in enumerate(foreign_voices):
#     _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
#     if 'cmu-arctic' in _str:
#         _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
            
#     print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
    
#     if (
#         not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or 
#         not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
#     ):

#         # Mimic3 GitHub Quota exceeded:
#         #    https://github.com/MycroftAI/mimic3-voices
#         #    Above repo can exceed download quota of LFS
#         # Copy mimic-voices from local copies
#         #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
#         #    copy to ~/
#         # 
#         #
#         home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
#         Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
#         speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice


#         if (
#             (not os.path.isfile(home_voice_dir + 'generator.onnx')) or 
#             (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
#                 ):

#             # Copy

#             shutil.copyfile(
#                 f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
#                 home_voice_dir + 'generator.onnx')

        
#         # pre made
#         prompt_path  = 'mimic3_foreign_4x/' + _str + '.wav'


#         # =========================================================================== HARVARD wav
#         with open('harvard.json', 'r') as f:
#             harvard_individual_sentences = json.load(f)['sentences']
#         total_audio_mimic3 = []
#         total_audio_stts2 = []
#         ix = 0
#         for list_of_10 in harvard_individual_sentences[:1]:  # 77
#             text = ' '.join(list_of_10['sentences'])
#             # harvard.append(long_sentence.replace('.', ' '))
#             # for text in list_of_10['sentences']:
#             style_vec = msinference.compute_style(prompt_path)
#             print(ix, text)
#             ix += 1


#             x = msinference.inference(text,
#                                         style_vec,
#                                         alpha=0.3,
#                                         beta=0.7,
#                                         diffusion_steps=7,
#                                         embedding_scale=1)
            
#             total_audio_stts2.append(x)

#             # also synthesize mimic with the same sentence and voice

#             # MIMIC-3 = = = = = = = = = = = = = = BEGIN

#             rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
#             _ssml = (
#                 '<speak>'
#                 '<prosody volume=\'64\'>'
#                 f'<prosody rate=\'{rate}\'>'
#                 f'<voice name=\'{_voice}\'>'
#                 '<s>'
#                 f'{text}'
#                 '</s>'
#                 '</voice>'
#                 '</prosody>'
#                 '</prosody>'
#                 '</speak>'
#             )
#             with open('_tmp_ssml.txt', 'w') as f:
#                 f.write(_ssml)


#             # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
#             # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer
#             args = get_args()
#             args.ssml = True
#             args.text = [_ssml]  #['aa', 'bb'] #txt
#             args.interactive = False
#             # args.output_naming = OutputNaming.TIME

#             state = CommandLineInterfaceState(args=args)
#             initialize_args(state)
#             initialize_tts(state)
#             # args.texts = [txt] #['aa', 'bb'] #txt
#             # state.stdout = '.' #None #'makeme.wav'
#             # state.output_dir = '.noopy'
#             # state.interactive = False
#             # state.output_naming = OutputNaming.TIME
#             # # state.ssml = 1234546575
#             # state.stdout = True
#             # state.tts = True
#             process_lines(state, wav_path='tmp1.wav')
#             shutdown_tts(state)
#             x, fs = audiofile.read('tmp1.wav')
#             total_audio_mimic3.append(x)
#             print(fs, text, 'mimic3')
            
#             # MIMIC3 = = = = = = = = = = = = = = END


#         total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
#         audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)

#         total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
#         audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)

#         print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
#     else:
#         print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')


# load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
# FOREIGN
# For every voice: synthesize the Harvard text with StyleTTS2 and Mimic-3
# (unless both WAVs already exist), then run the emotion interface on both
# WAVs and pickle the resulting frames.
for folder, list_voices in [
    ['foreign', foreign_voices],
    ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
    _dir = folder + '_pkl/'
    for _voice in list_voices[:4]:  # NOTE: only the first 4 voices of each group
        # File-name-safe voice identifier.
        _str = (
            _voice.replace('/', '_')
            .replace('#', '_')
            .replace('_low', '')
            .replace('cmu-arctic', 'cmu_arctic')
        )

        print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')

        mimic3_wav = _dir + 'mimic3__' + _str + '.wav'
        styletts2_wav = _dir + 'styletts2__' + _str + '.wav'

        if not (os.path.isfile(mimic3_wav) and os.path.isfile(styletts2_wav)):

            # Mimic3 GitHub quota exceeded:
            #    https://github.com/MycroftAI/mimic3-voices
            #    Above repo can exceed download quota of LFS
            # Copy mimic-voices from local copies
            #    clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
            #    copy to ~/
            speaker_free_voice_name = _voice.split('#')[0]
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)

            generator_onnx = home_voice_dir + 'generator.onnx'
            if (
                not os.path.isfile(generator_onnx)
                or os.path.getsize(generator_onnx) < 500  # .onnx - is just LFS header
            ):
                shutil.copyfile(
                    f'{ROOT_DIR}voices/{speaker_free_voice_name}/generator.onnx',
                    generator_onnx)

            # Pre-made reference audio of this voice, used as StyleTTS2 prompt.
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'

            # ACTUAL TTS
            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']

            # The style depends only on the voice prompt: compute it once
            # per voice instead of once per sentence list.
            style_vec = msinference.compute_style(prompt_path)

            total_audio_mimic3 = []
            total_audio_stts2 = []
            for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):  # 77
                text = ' '.join(list_of_10['sentences'])
                print(ix, text)

                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)
                total_audio_stts2.append(x)

                # MIMIC-3 = = = = = = = = = = = = = = BEGIN
                # Same sentence and voice, synthesized with Mimic-3.
                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                _ssml = (
                    "<speak>"
                    "<prosody volume='64'>"
                    f"<prosody rate='{rate}'>"
                    f"<voice name='{_voice}'>"
                    "<s>"
                    f"{text}"
                    "</s>"
                    "</voice>"
                    "</prosody>"
                    "</prosody>"
                    "</speak>"
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                # Deliberately NOT named `args`: that would rebind the
                # module-level `args` holding the torch devices.
                mimic3_args = get_args()
                mimic3_args.ssml = True
                mimic3_args.text = [_ssml]
                mimic3_args.interactive = False

                state = CommandLineInterfaceState(args=mimic3_args)
                initialize_args(state)
                initialize_tts(state)

                tmp_wav = 'tmp1.wav'
                # Remove the previous result first, so a failed synthesis
                # raises on read instead of silently reusing stale audio.
                Path(tmp_wav).unlink(missing_ok=True)
                try:
                    process_lines(state, wav_path=tmp_wav)
                finally:
                    shutdown_tts(state)
                x, fs = audiofile.read(tmp_wav)
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')
                # MIMIC3 = = = = = = = = = = = = = = END

            total_audio_stts2 = np.concatenate(total_audio_stts2)  # -- concat 77x lists
            audiofile.write(styletts2_wav, total_audio_stts2, 24000)

            total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat 77x lists
            audiofile.write(mimic3_wav, total_audio_mimic3, 22050)

            print('Saving:', _dir + 'mimic3__' + _str + '.wav')
        else:
            print('Skip:', _dir + 'styletts2__' + _str + '.wav')

        # AUD   I N T E R F A C E
        # One pickle of predictions per engine, computed only when missing.
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df_pred = interface.process_file(harvard_of_voice + '.wav')
                df_pred.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')