dkounadis committed
Commit a3b1af9
Parent: e30a816
Files changed (1):
  1. README.md +12 -12

README.md CHANGED
````diff
@@ -44,15 +44,14 @@ Florian Eyben, Felix Burkhardt, Björn Schuller.
 # Usage
 ```python
 from transformers import AutoModelForAudioClassification
+from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2Model,
+                                                            Wav2Vec2PreTrainedModel)
 import torch
 import types
 import torch.nn as nn
-from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2Model,
-                                                            Wav2Vec2PreTrainedModel)
 
 # speech signal 16 KHz
-sampling_rate = 16000
-signal = torch.zeros((1, sampling_rate))
+signal = torch.rand((1, 16000))
 device = 'cpu'
 
 class RegressionHead(nn.Module):
@@ -67,8 +66,6 @@ class RegressionHead(nn.Module):
         return self.out_proj(x.tanh())
 
 class Dawn(Wav2Vec2PreTrainedModel):
-    r"""https://arxiv.org/abs/2203.07378"""
-
     def __init__(self, config):
 
         super().__init__(config)
@@ -83,7 +80,7 @@ class Dawn(Wav2Vec2PreTrainedModel):
         x = self.wav2vec2(x / variance.sqrt())[0]
         return self.classifier(x.mean(1)).clip(0, 1)
 
-def _infer(self, x):
+def _forward(self, x):
     '''x: (batch, audio-samples-16KHz)'''
     x = (x + self.config.mean) / self.config.std  # plus
     x = self.ssl_model(x, attention_mask=None).last_hidden_state
@@ -99,22 +96,25 @@ def _infer(self, x):
     ], 1)
     return self.ser_model(x)
 
+
 # WavLM
+
 base = AutoModelForAudioClassification.from_pretrained(
-    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
-    trust_remote_code=True  # fun definitions see 3loi/SER-.. repo
-    ).to(device).eval()
-base.forward = types.MethodType(_infer, base)
+    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
+    trust_remote_code=True).to(device).eval()
+base.forward = types.MethodType(_forward, base)
 
 # Wav2Vec2.0
+
 dawn = Dawn.from_pretrained(
-    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
+    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
 ).to(device).eval()
 
 
 def wav2small(x):
     return .5 * dawn(x) + .5 * base(x)
 
+
 with torch.no_grad():
     pred = wav2small(signal.to(device))
     print(f'\nArousal = {pred[0, 0]} Dominance= {pred[0, 1]}',
````
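The renamed `_forward` function works as a drop-in forward because `types.MethodType` binds a free function to the `base` instance: after `base.forward = types.MethodType(_forward, base)`, calling `base(x)` dispatches to `_forward` with `self` set to the loaded WavLM model. A minimal, self-contained sketch of that binding idiom (the `Greeter` class and `_hello` function here are hypothetical, for illustration only):

```python
import types

class Greeter:
    pass

def _hello(self, name):
    # once bound, `self` is the instance, just as in a method defined in the class
    return f'hello {name} from {type(self).__name__}'

g = Greeter()
# bind the free function to this single instance; the class itself is untouched
g.hello = types.MethodType(_hello, g)
print(g.hello('world'))  # hello world from Greeter
```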
 
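As committed, the example scores a random one-second signal (`torch.rand((1, 16000))`). To score a real recording, the input must be a mono batch at 16 kHz. A hedged sketch of that preprocessing, assuming `torchaudio` is installed and using `speech.wav` as a placeholder path, with `wav2small` and `device` as defined in the README snippet above:

```python
import torch
import torchaudio

x, fs = torchaudio.load('speech.wav')  # placeholder path; x: (channels, samples)
x = x.mean(0, keepdim=True)            # downmix to mono -> (1, samples)
if fs != 16000:
    x = torchaudio.functional.resample(x, fs, 16000)  # models expect 16 kHz input

with torch.no_grad():
    pred = wav2small(x.to(device))     # .5 * dawn(x) + .5 * base(x)
print(f'Arousal = {pred[0, 0]} Dominance = {pred[0, 1]}')
```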