Dionyssos committed on
Commit
9cbdf67
1 Parent(s): 318370a

pkl per sentence - No audinterface

Files changed (1)
  1. correct_figure.py +378 -0
correct_figure.py ADDED
@@ -0,0 +1,378 @@
+ # we have to evaluate emotion & cer per sentence -> not audinterface sliding window
+ import os
+ import json
+ import types
+ import audresample
+ import torch
+ import matplotlib.pyplot as plt
+ import soundfile
+ import audb
+ import audiofile
+ import audmodel
+ import jiwer
+ import numpy as np
+ import pandas as pd
+ from pathlib import Path
+ from random import shuffle
+ import transformers
+ from transformers import AutoModelForAudioClassification
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
+ # https://arxiv.org/pdf/2407.12229
+ # https://arxiv.org/pdf/2312.05187
+ # https://arxiv.org/abs/2407.05407
+ # https://arxiv.org/pdf/2408.06577
+ # https://arxiv.org/pdf/2309.07405
+ import msinference
+
+ config = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg'
+ config.dev = torch.device('cuda:0')
+ config.dev2 = torch.device('cuda:0')
+
+ LABELS = ['arousal', 'dominance', 'valence',
+           'Angry',
+           'Sad',
+           'Happy',
+           'Surprise',
+           'Fear',
+           'Disgust',
+           'Contempt',
+           'Neutral',
+           ]
+
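+ # Categorical speech-emotion model: `_infer` replaces the stock forward of the
+ # 3loi/SER-Odyssey checkpoint. It standardises the raw 16 kHz waveform with the
+ # model's stored mean/std, encodes it with the SSL backbone, applies attentive
+ # statistics pooling (attention-weighted mean concatenated with the matching
+ # std), and maps the pooled vector to 8 categorical logits via the SER head.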
+ def _infer(self, x):
+     '''x: (batch, audio-samples-16KHz)'''
+     x = (x + self.config.mean) / self.config.std  # plus
+     x = self.ssl_model(x, attention_mask=None).last_hidden_state
+     # pool
+     h = self.pool_model.sap_linear(x).tanh()
+     w = torch.matmul(h, self.pool_model.attention)
+     w = w.softmax(1)
+     mu = (x * w).sum(1)
+     x = torch.cat(
+         [
+             mu,
+             ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
+         ], 1)
+     return self.ser_model(x)
+
+
+ teacher_cat = AutoModelForAudioClassification.from_pretrained(
+     '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
+     trust_remote_code=True  # function definitions: see the 3loi/SER-.. repo
+ ).to(config.dev2).eval()
+ teacher_cat.forward = types.MethodType(_infer, teacher_cat)
+
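+ # Dimensional speech-emotion model ("Dawn"): the audeering wav2vec2-large-robust
+ # A/D/V regressor. `_prenorm` normalises the raw waveform to zero mean / unit
+ # variance per utterance (honouring the attention mask when given), the wav2vec2
+ # hidden states are mean-pooled over time, and `RegressionHead` maps the pooled
+ # vector to the three continuous scores arousal / dominance / valence.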
+ # ===================[:]===================== Dawn
+ def _prenorm(x, attention_mask=None):
+     '''mean/var'''
+     if attention_mask is not None:
+         N = attention_mask.sum(1, keepdim=True)  # here the attn mask is unprocessed, just the original input
+         x -= x.sum(1, keepdim=True) / N
+         var = (x * x).sum(1, keepdim=True) / N
+     else:
+         x -= x.mean(1, keepdim=True)  # mean is the ONNX ReduceMean operator; saves some ops compared to casting integer N to float and dividing
+         var = (x * x).mean(1, keepdim=True)
+     return x / torch.sqrt(var + 1e-7)
+
+
+ from torch import nn
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
+
+
+ class RegressionHead(nn.Module):
+     r"""Classification head."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class Dawn(Wav2Vec2PreTrainedModel):
+     r"""Speech emotion classifier."""
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = RegressionHead(config)
+         self.init_weights()
+
+     def forward(self,
+                 input_values,
+                 attention_mask=None,
+                 ):
+         x = _prenorm(input_values, attention_mask=attention_mask)
+         outputs = self.wav2vec2(x, attention_mask=attention_mask)
+         hidden_states = outputs[0]
+         hidden_states = torch.mean(hidden_states, dim=1)
+         logits = self.classifier(hidden_states)
+         return logits
+         # return {'hidden_states': hidden_states,
+         #         'logits': logits}
+
+
+ dawn = Dawn.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(config.dev).eval()
+ # =======================================
+
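+ # Whisper large-v3 ASR pipeline: transcribes both the style prompt and the
+ # synthesised audio so that a character error rate (CER) can be computed
+ # per sentence with jiwer.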
+ torch_dtype = torch.float16  # if torch.cuda.is_available() else torch.float32
+ model_id = "openai/whisper-large-v3"
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ ).to(config.dev)
+ processor = AutoProcessor.from_pretrained(model_id)
+ _pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=30,
+     batch_size=16,
+     return_timestamps=True,
+     torch_dtype=torch_dtype,
+     device=config.dev,
+ )
+
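+ # Returns an 11-dim emotion vector per clip: 3 A/D/V scores from `dawn`
+ # followed by 8 softmaxed category probabilities from `teacher_cat`,
+ # in the same order as LABELS.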
+ def process_function(x, sampling_rate, idx):
+     # x = x[None , :] ASaHSuFDCN
+     # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
+     #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
+     # tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
+     logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).softmax(1)
+     logits_adv = dawn(torch.from_numpy(x).to(config.dev))
+
+     out = torch.cat([logits_adv,
+                      logits_cat],
+                     1).cpu().detach().numpy()
+     # print(out.shape)
+     return out[0, :]
+
+
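+ # Collects natural speech prompts via audb. Only emodb is enabled here; the
+ # commented entries list other candidate corpora. Tables whose index carries
+ # start/end timedeltas are currently skipped, full-file tables contribute
+ # plain file paths.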
+ def load_speech(split=None):
+     DB = [
+         # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
+         # ['crema-d', '1.1.1', 'emotion.voice.test', False],
+         # ['librispeech', '3.1.0', 'test-clean', False],
+         ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
+         # ['entertain-playtestcloud', '1.1.0', 'emotion.categories.train.gold_standard', True],
+         # ['erik', '2.2.0', 'emotion.categories.train.gold_standard', True],
+         # ['meld', '1.3.1', 'emotion.categories.train.gold_standard', False],
+         # ['msppodcast', '5.0.0', 'emotion.categories.train.gold_standard', False],  # standalone bucket because it has gt labels?
+         # ['myai', '1.0.1', 'emotion.categories.train.gold_standard', False],
+         # ['casia', None, 'emotion.categories.gold_standard', False],
+         # ['switchboard-1', None, 'sentiment', True],
+         # ['swiss-parliament', None, 'segments', True],
+         # ['argentinian-parliament', None, 'segments', True],
+         # ['austrian-parliament', None, 'segments', True],
+         # # 'german' --> bundestag
+         # ['brazilian-parliament', None, 'segments', True],
+         # ['mexican-parliament', None, 'segments', True],
+         # ['portuguese-parliament', None, 'segments', True],
+         # ['spanish-parliament', None, 'segments', True],
+         # ['chinese-vocal-emotions-liu-pell', None, 'emotion.categories.desired', False],
+         # peoples-speech slow
+         # ['peoples-speech', None, 'train-initial', False]
+     ]
+
+     output_list = []
+     for database_name, ver, table, has_timedeltas in DB:
+
+         a = audb.load(database_name,
+                       sampling_rate=16000,
+                       format='wav',
+                       mixdown=True,
+                       version=ver,
+                       cache_root='/cache/audb/')
+         a = a[table].get()
+         if has_timedeltas:
+             print(f'{has_timedeltas=}')
+             # a = a.reset_index()[['file', 'start', 'end']]
+             # output_list += [[*t] for t
+             #                 in zip(a.file.values, a.start.dt.total_seconds().values, a.end.dt.total_seconds().values)]
+         else:
+             output_list += [f for f in a.index]  # use file (no timedeltas)
+     return output_list
+
+
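+ # Evaluation material: natural prompts (emodb via audb), the Harvard sentences
+ # from harvard.json, and four pools of synthetic style prompts
+ # (English / English 4x / foreign / foreign 4x). Styles shorter than 2 s are
+ # filtered out via the audiofile.duration() check below.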
+ natural_wav_paths = load_speech()
+
+ with open('harvard.json', 'r') as f:
+     harvard_individual_sentences = json.load(f)['sentences']
+
+ synthetic_wav_paths = ['./enslow/' + i for i in
+                        os.listdir('./enslow/')]
+ synthetic_wav_paths_4x = ['./style_vector_v2/' + i for i in
+                           os.listdir('./style_vector_v2/')]
+ synthetic_wav_paths_foreign = ['./mimic3_foreign/' + i for i in os.listdir('./mimic3_foreign/') if 'en_U' not in i]
+ synthetic_wav_paths_foreign_4x = ['./mimic3_foreign_4x/' + i for i in os.listdir('./mimic3_foreign_4x/') if 'en_U' not in i]  # very short segments
+
+ # filter very short styles
+ synthetic_wav_paths_foreign = [i for i in synthetic_wav_paths_foreign if audiofile.duration(i) > 2]
+ synthetic_wav_paths_foreign_4x = [i for i in synthetic_wav_paths_foreign_4x if audiofile.duration(i) > 2]
+ synthetic_wav_paths = [i for i in synthetic_wav_paths if audiofile.duration(i) > 2]
+ synthetic_wav_paths_4x = [i for i in synthetic_wav_paths_4x if audiofile.duration(i) > 2]
+
+ shuffle(synthetic_wav_paths_foreign_4x)
+ shuffle(synthetic_wav_paths_foreign)
+ shuffle(synthetic_wav_paths)
+ shuffle(synthetic_wav_paths_4x)
+ print(len(synthetic_wav_paths_foreign_4x), len(synthetic_wav_paths_foreign),
+       len(synthetic_wav_paths), len(synthetic_wav_paths_4x))  # 134 204 134 204
+
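+ # One pkl per prompt condition. Each row is one Harvard sentence:
+ #   columns  0..10  -> emotion of the style prompt (LABELS order)
+ #   columns 11..21  -> emotion of the StyleTTS2 output
+ #   column  22      -> CER of the prompt transcription
+ #   column  23      -> CER of the StyleTTS2 transcription vs the input text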
+ for audio_prompt in ['english',
+                      'english_4x',
+                      'human',
+                      'foreign',
+                      'foreign_4x']:  # each of these creates a separate pkl - so outer for
+
+     data = np.zeros((767, len(LABELS) * 2 + 2))  # ~720 Harvard sentences x (LABELS-prompt & LABELS-stts2 & cer-prompt & cer-stts2)
+
+     OUT_FILE = f'{audio_prompt}_analytic.pkl'
+     if not os.path.isfile(OUT_FILE):
+         ix = 0
+         for list_of_10 in harvard_individual_sentences[:10004]:
+             # long_sentence = ' '.join(list_of_10['sentences'])
+             # harvard.append(long_sentence.replace('.', ' '))
+             for text in list_of_10['sentences']:
+                 if audio_prompt == 'english':
+                     _p = synthetic_wav_paths[ix % len(synthetic_wav_paths)]  # 134
+                     style_vec = msinference.compute_style(_p)
+                 elif audio_prompt == 'english_4x':
+                     _p = synthetic_wav_paths_4x[ix % len(synthetic_wav_paths_4x)]  # 134
+                     style_vec = msinference.compute_style(_p)
+                 elif audio_prompt == 'human':
+                     _p = natural_wav_paths[ix % len(natural_wav_paths)]
+                     style_vec = msinference.compute_style(_p)
+                 elif audio_prompt == 'foreign':
+                     _p = synthetic_wav_paths_foreign[ix % len(synthetic_wav_paths_foreign)]  # 204; some short styles are discarded ~ 1180
+                     style_vec = msinference.compute_style(_p)
+                 elif audio_prompt == 'foreign_4x':
+                     _p = synthetic_wav_paths_foreign_4x[ix % len(synthetic_wav_paths_foreign_4x)]  # 174
+                     style_vec = msinference.compute_style(_p)
+                 else:
+                     print('unknown list of style vectors')
+
+                 x = msinference.inference(text,
+                                           style_vec,
+                                           alpha=0.3,
+                                           beta=0.7,
+                                           diffusion_steps=7,
+                                           embedding_scale=1)
+                 x = audresample.resample(x, 24000, 16000)
+
+                 _st, fsr = audiofile.read(_p)
+                 _st = audresample.resample(_st, fsr, 16000)
+                 print(_st.shape, x.shape)
+
+                 emotion_of_prompt = process_function(_st, 16000, None)
+                 emotion_of_out = process_function(x, 16000, None)
+                 data[ix, :11] = emotion_of_prompt
+                 data[ix, 11:22] = emotion_of_out
+
+                 # 2 last columns are cer-prompt / cer-styletts2
+                 transcription_prompt = _pipe(_st[0])
+                 transcription_styletts2 = _pipe(x[0])  # allow singleton for EMO process func
+                 # print(len(emotion_of_prompt + emotion_of_out), ix, text)
+                 print(transcription_prompt, transcription_styletts2)
+
+                 data[ix, 22] = jiwer.cer('Sweet dreams are made of this. I travel the world and the seven seas.',
+                                          transcription_prompt['text'])
+                 data[ix, 23] = jiwer.cer(text,
+                                          transcription_styletts2['text'])
+                 print(data[ix, :])
+
+                 ix += 1
+
+         df = pd.DataFrame(data, columns=['prompt-' + i for i in LABELS] + ['styletts2-' + i for i in LABELS] + ['cer-prompt', 'cer-styletts2'])
+         df.to_pickle(OUT_FILE)
+     else:
+         df = pd.read_pickle(OUT_FILE)
+         print(f'\nALREADY EXISTS\n{df}')
+         # From the pickle we should also run cer and whisper on every prompt
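+         # A minimal follow-up sketch (not part of the run above): the per-condition
+         # pickles written by this script can be aggregated afterwards, e.g.
+         #   df = pd.read_pickle('english_analytic.pkl')
+         #   print(df[['cer-prompt', 'cer-styletts2']].mean())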