dkounadis committed
Commit 8d889ff
1 Parent(s): 94f36bf

Set dawn to eval, verify CCC

Files changed (1): README.md (+21 -34)
README.md CHANGED
@@ -16,8 +16,8 @@ tags:
 - speech-emotion-recognition
 - dkounadis
 ---
-Tecaher model based on [Wavlm](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) and [wav2vec2](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) for arousal/dominance/valence prediction.
-Achieves `0.6760566` valence CCC on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) Test1.
+Model based on [WavLM](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) and [wav2vec2](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) for arousal/dominance/valence prediction.
+Achieves `0.6760566` valence CCC on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) Test 1. Used as teacher for [wav2small]().
 
 
@@ -26,7 +26,7 @@ Achieves `0.6760566` valence CCC on [MSP-Podcast](https://ecs.utdallas.edu/resea
 <tr><th colspan=6 align="center">CCC MSP Podcast v1.7</th></tr>
 <tr><th colspan=3 align="center">Test 1</th><th colspan=3 align="center">Test 2</th></tr>
 <tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
-<tr> <td>0.6760566</td> <td>0.6840190</td> <td>0.7620374</td> <td>0.4229267</td> <td>0.4684658</td> <td>0.4857733</td> </tr>
+<tr> <td>0.6760566</td> <td>0.6840044</td> <td>0.7620181</td> <td>0.4229267</td> <td>0.4684658</td> <td>0.4857733</td> </tr>
 </table>
 
 
@@ -43,7 +43,7 @@ from transformers.models.wav2vec2.modeling_wav2vec2 import (
     Wav2Vec2PreTrainedModel,
 )
 
-device = 'cuda:0'
+device = 'cpu'
 
 
 class RegressionHead(nn.Module):
@@ -74,67 +74,54 @@ class Dawn(Wav2Vec2PreTrainedModel):
         self.wav2vec2 = Wav2Vec2Model(config)
         self.classifier = RegressionHead(config)
 
-    def forward(
-        self,
-        x,
-    ):
-
+    def forward(self, x):
+        '''x: (batch, audio-samples-16KHz)'''
         x = x - x.mean(1, keepdim=True)
         variance = (x * x).mean(1, keepdim=True) + 1e-7
         out = self.wav2vec2(x / variance.sqrt())
-
         return self.classifier(out[0].mean(1)).clip(0, 1)
 
 
 def _infer(self, x):
-    '''re-definition for less cpu'''
-    # x = (x + 8.278621631819787e-05) / 0.08485610250851999
-    x = (x + self.config.mean) / self.config.std
+    '''x: (batch, audio-samples-16KHz)'''
+    x = (x + self.config.mean) / self.config.std  # note the plus sign
     x = self.ssl_model(x, attention_mask=None).last_hidden_state
     # attentive statistics pooling
     h = self.pool_model.sap_linear(x).tanh()
     w = torch.matmul(h, self.pool_model.attention)
     w = w.softmax(1)
-    mu = torch.sum(x * w, 1)
+    mu = (x * w).sum(1)
     x = torch.cat(
-        [
-            mu,
-            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-5).sqrt()
-        ], 1)
-    return self.ser_model(x).clip(0, 1)
+        [
+            mu,
+            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
+        ], 1)
+    return self.ser_model(x)
+
 
 # WavLM
 
-# https://lab-msp.com/MSP-Podcast_Competition/leaderboard.php
 base = AutoModelForAudioClassification.from_pretrained(
     '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
-    trust_remote_code=True  # extra definitions see above repository
+    trust_remote_code=True  # function definitions: see the 3loi/SER-.. repo
 ).to(device).eval()
 base.forward = types.MethodType(_infer, base)
 
-# Wav2Vec2.0
+# Wav2Vec2
 
 dawn = Dawn.from_pretrained(
     'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
-).to(device)
+).to(device).eval()
 
-# Teacher
 
 def wav2small(x):
-    '''average predctions'''
     return .5 * dawn(x) + .5 * base(x)
 
 
 x, _ = librosa.load('test.wav', sr=base.config.sampling_rate)
 
 with torch.no_grad():
-    pred = wav2small(
-        torch.from_numpy(x[None, :]
-        ).to(device))
-
-
-    print(f'\narousal = {pred[0, 0]}',
-          f'\ndominance= {pred[0, 1]}',
-          f'\nvalence = {pred[0, 2]}')
-
+    pred = wav2small(torch.from_numpy(x[None, :]).to(device))
+    print(f'\nArousal = {pred[0, 0]} Dominance= {pred[0, 1]}',
+          f' Valence = {pred[0, 2]}')
 ```
 
 
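A note on the metric: CCC is the concordance correlation coefficient between predicted and gold attribute values. A minimal sketch of how the table's numbers can be computed; the helper `ccc` and the `preds`/`labels` arrays are ours, not part of this repo:

```python
import numpy as np

def ccc(pred, lab):
    '''Concordance correlation coefficient of two 1-D arrays.'''
    m_p, m_l = pred.mean(), lab.mean()
    cov = ((pred - m_p) * (lab - m_l)).mean()
    return 2 * cov / (pred.var() + lab.var() + (m_p - m_l) ** 2)

# preds, labels: (N, 3) arrays over a test set, columns ordered
# arousal / dominance / valence as in the print() above, e.g.
# valence_ccc = ccc(preds[:, 2], labels[:, 2])
```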
 
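For orientation, `_infer` pools the WavLM frames with attentive statistics: a softmax over time yields weights `w`, and the utterance embedding concatenates the weighted mean with the weighted standard deviation. A standalone sketch with made-up shapes (the real hidden size comes from the 3loi config):

```python
import torch

B, T, D = 2, 50, 1024               # batch, frames, hidden size (assumed)
x = torch.randn(B, T, D)            # frame-level features
w = torch.rand(B, T, 1).softmax(1)  # attention weights over time

mu = (x * w).sum(1)                 # weighted mean, (B, D)
# weighted std via E[x^2] - E[x]^2, floored at 1e-7 as in the commit
sd = ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
pooled = torch.cat([mu, sd], 1)     # (B, 2*D), fed to ser_model
print(pooled.shape)                 # torch.Size([2, 2048])
```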
 
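The commit's other change, calling `.eval()` on `dawn`, matters because `Wav2Vec2Model` applies dropout (and time masking) while in train mode, so repeated calls on the same audio would not agree. A quick sanity check, assuming the script above has already run:

```python
# in eval mode, two no_grad passes on identical input agree exactly
x = torch.zeros(1, 16000)  # one second of silence at 16 kHz
with torch.no_grad():
    a = dawn(x.to(device))
    b = dawn(x.to(device))
print(torch.equal(a, b))   # True after .eval(); dropout would break this
```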
 
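Since this ensemble serves as a teacher for wav2small, a typical use is labelling a whole corpus with pseudo-targets. A sketch, where `files` is a hypothetical list of wav paths:

```python
import numpy as np

files = ['a.wav', 'b.wav']  # hypothetical paths
preds = []
with torch.no_grad():
    for f in files:
        wav, _ = librosa.load(f, sr=base.config.sampling_rate)
        p = wav2small(torch.from_numpy(wav[None, :]).to(device))
        preds.append(p[0].cpu().numpy())
preds = np.stack(preds)     # (N, 3): arousal, dominance, valence
```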
 
 