Upload model
Browse files
- config.json +19 -0
- pipeline_utils.py +165 -0
- pytorch_model.bin +3 -0
config.json
ADDED
@@ -0,0 +1,19 @@
{
  "architectures": [
    "SERModel"
  ],
  "auto_map": {
    "AutoConfig": "pipeline_utils.SERConfig",
    "AutoModelForAudioClassification": "pipeline_utils.SERModel"
  },
  "classifier_dropout_prob": 0.5,
  "classifier_hidden_layers": 1,
  "hidden_size": 1024,
  "model_type": "ser",
  "num_attention_heads": 16,
  "num_classes": 1,
  "num_hidden_layers": 24,
  "ssl_type": "microsoft/wavlm-large",
  "torch_dtype": "float32",
  "transformers_version": "4.34.0.dev0"
}
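
The "auto_map" entries above are what let transformers resolve the custom SERConfig and SERModel classes from pipeline_utils.py when the repo is loaded with trust_remote_code=True. A minimal loading sketch; "your-namespace/your-ser-model" is a placeholder for the actual repo id:

# Minimal loading sketch; repo id is a placeholder, substitute the real one.
from transformers import AutoConfig, AutoModelForAudioClassification

config = AutoConfig.from_pretrained(
    "your-namespace/your-ser-model",
    trust_remote_code=True,  # resolves SERConfig via the auto_map entry
)
model = AutoModelForAudioClassification.from_pretrained(
    "your-namespace/your-ser-model",
    trust_remote_code=True,  # resolves SERModel from pipeline_utils.py
)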
pipeline_utils.py
ADDED
@@ -0,0 +1,165 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig


class Pooling(nn.Module):
    def __init__(self):
        super().__init__()

    def compute_length_from_mask(self, mask):
        """
        mask: (batch_size, T)
        Assuming that the sampling rate is 16kHz, the frame shift is 20ms
        """
        wav_lens = torch.sum(mask, dim=1)  # (batch_size, )
        # Map waveform sample counts to SSL frame counts (20 ms hop at 16 kHz).
        feat_lens = torch.div(wav_lens - 1, 16000 * 0.02, rounding_mode="floor") + 1
        feat_lens = feat_lens.int().tolist()
        return feat_lens

    def forward(self, x, mask):
        raise NotImplementedError


class MeanPooling(Pooling):
    def __init__(self):
        super().__init__()

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            # Average only over the unpadded frames of each utterance.
            pooled = torch.mean(x[:feat_len], dim=0)  # (feat_dim, )
            pooled_list.append(pooled)
        pooled = torch.stack(pooled_list, dim=0)  # (batch_size, feat_dim)
        return pooled


class AttentiveStatisticsPooling(Pooling):
    """
    AttentiveStatisticsPooling
    Paper: Attentive Statistics Pooling for Deep Speaker Embedding
    Link: https://arxiv.org/pdf/1803.10963.pdf
    """
    def __init__(self, input_size):
        super().__init__()
        self._indim = input_size
        self.sap_linear = nn.Linear(input_size, input_size)
        self.attention = nn.Parameter(torch.FloatTensor(input_size, 1))
        torch.nn.init.normal_(self.attention, mean=0, std=1)

    def forward(self, xs, mask):
        """
        xs: (batch_size, T, feat_dim)
        mask: (batch_size, T)

        => output: (batch_size, feat_dim*2)
        """
        feat_lens = self.compute_length_from_mask(mask)
        pooled_list = []
        for x, feat_len in zip(xs, feat_lens):
            x = x[:feat_len].unsqueeze(0)
            # Attention weights over frames, then weighted mean and std.
            h = torch.tanh(self.sap_linear(x))
            w = torch.matmul(h, self.attention).squeeze(dim=2)
            w = F.softmax(w, dim=1).view(x.size(0), x.size(1), 1)
            mu = torch.sum(x * w, dim=1)
            rh = torch.sqrt((torch.sum((x**2) * w, dim=1) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, rh), 1).squeeze(0)
            pooled_list.append(x)
        return torch.stack(pooled_list)


class EmotionRegression(nn.Module):
    def __init__(self, *args, **kwargs):
        super(EmotionRegression, self).__init__()
        input_dim = args[0]
        hidden_dim = args[1]
        num_layers = args[2]
        output_dim = args[3]
        p = kwargs.get("dropout", 0.5)

        # MLP head: num_layers blocks of Linear -> LayerNorm -> ReLU -> Dropout.
        self.fc = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
            )
        ])
        for lidx in range(num_layers - 1):
            self.fc.append(
                nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(p)
                )
            )
        self.out = nn.Sequential(
            nn.Linear(hidden_dim, output_dim)
        )

        self.inp_drop = nn.Dropout(p)

    def get_repr(self, x):
        h = self.inp_drop(x)
        for lidx, fc in enumerate(self.fc):
            h = fc(h)
        return h

    def forward(self, x):
        h = self.get_repr(x)
        result = self.out(h)
        return result


class SERConfig(PretrainedConfig):
    model_type = "ser"

    def __init__(
        self,
        num_classes: int = 1,
        num_attention_heads=16,
        num_hidden_layers=24,
        hidden_size=1024,
        classifier_hidden_layers=1,
        classifier_dropout_prob=0.5,
        ssl_type="microsoft/wavlm-large",
        torch_dtype="float32",
        **kwargs,
    ):
        self.num_classes = num_classes
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.classifier_hidden_layers = classifier_hidden_layers
        self.classifier_dropout_prob = classifier_dropout_prob
        self.ssl_type = ssl_type
        self.torch_dtype = torch_dtype
        super().__init__(**kwargs)


class SERModel(PreTrainedModel):
    config_class = SERConfig

    def __init__(self, config):
        super().__init__(config)
        # SSL backbone (WavLM-large by default) with its CNN feature encoder frozen.
        self.ssl_model = AutoModel.from_pretrained(config.ssl_type)
        self.ssl_model.freeze_feature_encoder()

        self.pool_model = AttentiveStatisticsPooling(config.hidden_size)

        # Pooling concatenates mean and std, hence hidden_size*2 input dim.
        self.ser_model = EmotionRegression(config.hidden_size * 2,
                                           config.hidden_size,
                                           config.classifier_hidden_layers,
                                           config.num_classes,
                                           dropout=config.classifier_dropout_prob)

    def forward(self, x, mask):
        ssl = self.ssl_model(x, attention_mask=mask).last_hidden_state  # (batch, T', hidden_size)

        ssl = self.pool_model(ssl, mask)  # (batch, hidden_size*2)

        pred = self.ser_model(ssl)  # (batch, num_classes)

        return pred
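
SERModel.forward expects raw 16 kHz waveforms x of shape (batch, samples) and a binary attention mask of the same shape. An end-to-end inference sketch under the assumption that the backbone's own feature extractor is used for padding and normalization; the dummy waveforms and the reuse of model from the earlier loading sketch are illustrative:

import torch
from transformers import AutoFeatureExtractor

# Assumption: reuse the WavLM backbone's feature extractor for batching.
extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-large")

waveforms = [torch.randn(16000 * 3), torch.randn(16000 * 2)]  # dummy 3 s and 2 s clips
batch = extractor(
    [w.numpy() for w in waveforms],
    sampling_rate=16000,
    padding=True,
    return_tensors="pt",
)

model.eval()  # `model` as loaded in the earlier sketch
with torch.no_grad():
    pred = model(batch.input_values, batch.attention_mask)  # (batch, num_classes)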
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6fc0167d183d89114be10df1c4e4f74040b558408efee99a71fcf5205865ef2
size 1274585617