Dionyssos committed on
Commit
08238e0
1 Parent(s): 8a2aca3

add styles

Browse files
This view is limited to 50 files because the commit contains too many changes. See raw diff
Files changed (50) hide show
  1. mimic3_make_harvard_sentences.py +88 -6
  2. style_vector/en_UK_apope.wav +0 -0
  3. style_vector/en_US_cmu_arctic_aew.wav +0 -0
  4. style_vector/en_US_cmu_arctic_ahw.wav +0 -0
  5. style_vector/en_US_cmu_arctic_aup.wav +0 -0
  6. style_vector/en_US_cmu_arctic_awbrms.wav +0 -0
  7. style_vector/en_US_cmu_arctic_axb.wav +0 -0
  8. style_vector/en_US_cmu_arctic_bdl.wav +0 -0
  9. style_vector/en_US_cmu_arctic_clb.wav +0 -0
  10. style_vector/en_US_cmu_arctic_eey.wav +0 -0
  11. style_vector/en_US_cmu_arctic_fem.wav +0 -0
  12. style_vector/en_US_cmu_arctic_gka.wav +0 -0
  13. style_vector/en_US_cmu_arctic_jmk.wav +0 -0
  14. style_vector/en_US_cmu_arctic_ksp.wav +0 -0
  15. style_vector/en_US_cmu_arctic_ljm.wav +0 -0
  16. style_vector/en_US_cmu_arctic_lnh.wav +0 -0
  17. style_vector/en_US_cmu_arctic_rxr.wav +0 -0
  18. style_vector/en_US_cmu_arctic_slp.wav +0 -0
  19. style_vector/en_US_cmu_arctic_slt.wav +0 -0
  20. style_vector/en_US_hifi-tts_6097.wav +0 -0
  21. style_vector/en_US_hifi-tts_9017.wav +0 -0
  22. style_vector/en_US_hifi-tts_92.wav +0 -0
  23. style_vector/en_US_ljspeech.wav +0 -0
  24. style_vector/en_US_m-ailabs_elliot_miller.wav +0 -0
  25. style_vector/en_US_m-ailabs_judy_bieber.wav +0 -0
  26. style_vector/en_US_m-ailabs_mary_ann.wav +0 -0
  27. style_vector/en_US_vctk_p225.wav +0 -0
  28. style_vector/en_US_vctk_p226.wav +0 -0
  29. style_vector/en_US_vctk_p227.wav +0 -0
  30. style_vector/en_US_vctk_p228.wav +0 -0
  31. style_vector/en_US_vctk_p229.wav +0 -0
  32. style_vector/en_US_vctk_p230.wav +0 -0
  33. style_vector/en_US_vctk_p231.wav +0 -0
  34. style_vector/en_US_vctk_p232.wav +0 -0
  35. style_vector/en_US_vctk_p233.wav +0 -0
  36. style_vector/en_US_vctk_p234.wav +0 -0
  37. style_vector/en_US_vctk_p236.wav +0 -0
  38. style_vector/en_US_vctk_p237.wav +0 -0
  39. style_vector/en_US_vctk_p238.wav +0 -0
  40. style_vector/en_US_vctk_p239.wav +0 -0
  41. style_vector/en_US_vctk_p240.wav +0 -0
  42. style_vector/en_US_vctk_p241.wav +0 -0
  43. style_vector/en_US_vctk_p243.wav +0 -0
  44. style_vector/en_US_vctk_p244.wav +0 -0
  45. style_vector/en_US_vctk_p245.wav +0 -0
  46. style_vector/en_US_vctk_p246.wav +0 -0
  47. style_vector/en_US_vctk_p247.wav +0 -0
  48. style_vector/en_US_vctk_p248.wav +0 -0
  49. style_vector/en_US_vctk_p249.wav +0 -0
  50. style_vector/en_US_vctk_p250.wav +0 -0
mimic3_make_harvard_sentences.py CHANGED
@@ -77,6 +77,21 @@ list_voices = [
77
 
78
 
79
  # ================================================== INTERFACE MODELS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  LABELS = [
81
  'arousal', 'dominance', 'valence',
82
  # 'speech_synthesizer', 'synthetic_singing',
@@ -131,10 +146,77 @@ teacher_cat.forward = types.MethodType(_infer, teacher_cat)
131
 
132
 
133
 
134
- # Audioset & ADV
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
- # audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
137
- adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')
138
 
139
  def process_function(x, sampling_rate, idx):
140
  '''run audioset ct, adv
@@ -154,7 +236,7 @@ def process_function(x, sampling_rate, idx):
154
  # logits_audioset = audioset_model(x, 16000)['logits_sounds']
155
  # logits_audioset = logits_audioset[:, [7, 35]] # speech synthesizer synthetic singing
156
  # --
157
- logits_adv = adv_model(x, 16000)['logits']
158
 
159
  cat = np.concatenate([logits_adv,
160
  # _sigmoid(logits_audioset),
@@ -169,7 +251,7 @@ interface = audinterface.Feature(
169
  # process_func_args={'outputs': 'logits_scene'},
170
  process_func_applies_sliding_window=False,
171
  win_dur=7.0,
172
- hop_dur=4.0,
173
  sampling_rate=16000,
174
  resample=True,
175
  verbose=True,
@@ -297,7 +379,7 @@ for _id, _voice in enumerate(list_voices):
297
  total_audio_mimic3 = []
298
  total_audio_styletts2 = []
299
  ix = 0
300
- for list_of_10 in harvard_individual_sentences[:1000]: # 77
301
 
302
  text = ' '.join(list_of_10['sentences'])
303
 
 
77
 
78
 
79
  # ================================================== INTERFACE MODELS
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
  LABELS = [
96
  'arousal', 'dominance', 'valence',
97
  # 'speech_synthesizer', 'synthetic_singing',
 
146
 
147
 
148
 
149
+ # ===================[:]===================== Dawn
150
+ def _prenorm(x, attention_mask=None):
151
+ '''mean/var'''
152
+ if attention_mask is not None:
153
+ N = attention_mask.sum(1, keepdim=True) # here attn msk is unprocessed just the original input
154
+ x -= x.sum(1, keepdim=True) / N
155
+ var = (x * x).sum(1, keepdim=True) / N
156
+
157
+ else:
158
+ x -= x.mean(1, keepdim=True) # mean is an onnx operator reducemean saves some ops compared to casting integer N to float and the div
159
+ var = (x * x).mean(1, keepdim=True)
160
+ return x / torch.sqrt(var + 1e-7)
161
+
162
+ from torch import nn
163
+ from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
164
+ class RegressionHead(nn.Module):
165
+ r"""Classification head."""
166
+
167
+ def __init__(self, config):
168
+
169
+ super().__init__()
170
+
171
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
172
+ self.dropout = nn.Dropout(config.final_dropout)
173
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
174
+
175
+ def forward(self, features, **kwargs):
176
+
177
+ x = features
178
+ x = self.dropout(x)
179
+ x = self.dense(x)
180
+ x = torch.tanh(x)
181
+ x = self.dropout(x)
182
+ x = self.out_proj(x)
183
+
184
+ return x
185
+
186
+
187
+ class Dawn(Wav2Vec2PreTrainedModel):
188
+ r"""Speech emotion classifier."""
189
+
190
+ def __init__(self, config):
191
+
192
+ super().__init__(config)
193
+
194
+ self.config = config
195
+ self.wav2vec2 = Wav2Vec2Model(config)
196
+ self.classifier = RegressionHead(config)
197
+ self.init_weights()
198
+
199
+ def forward(
200
+ self,
201
+ input_values,
202
+ attention_mask=None,
203
+ ):
204
+ x = _prenorm(input_values, attention_mask=attention_mask)
205
+ outputs = self.wav2vec2(x, attention_mask=attention_mask)
206
+ hidden_states = outputs[0]
207
+ hidden_states = torch.mean(hidden_states, dim=1)
208
+ logits = self.classifier(hidden_states)
209
+ return logits
210
+ # return {'hidden_states': hidden_states,
211
+ # 'logits': logits}
212
+ dawn = Dawn.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(config.dev).eval()
213
+ # =======================================
214
+
215
+
216
+
217
+
218
+
219
 
 
 
220
 
221
  def process_function(x, sampling_rate, idx):
222
  '''run audioset ct, adv
 
236
  # logits_audioset = audioset_model(x, 16000)['logits_sounds']
237
  # logits_audioset = logits_audioset[:, [7, 35]] # speech synthesizer synthetic singing
238
  # --
239
+ logits_adv = dawn(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy() #['logits']
240
 
241
  cat = np.concatenate([logits_adv,
242
  # _sigmoid(logits_audioset),
 
251
  # process_func_args={'outputs': 'logits_scene'},
252
  process_func_applies_sliding_window=False,
253
  win_dur=7.0,
254
+ hop_dur=40.0,
255
  sampling_rate=16000,
256
  resample=True,
257
  verbose=True,
 
379
  total_audio_mimic3 = []
380
  total_audio_styletts2 = []
381
  ix = 0
382
+ for list_of_10 in harvard_individual_sentences[:4]: # 77
383
 
384
  text = ' '.join(list_of_10['sentences'])
385
 
style_vector/en_UK_apope.wav ADDED
Binary file (99.9 kB). View file
 
style_vector/en_US_cmu_arctic_aew.wav ADDED
Binary file (96.3 kB). View file
 
style_vector/en_US_cmu_arctic_ahw.wav ADDED
Binary file (95.8 kB). View file
 
style_vector/en_US_cmu_arctic_aup.wav ADDED
Binary file (90.2 kB). View file
 
style_vector/en_US_cmu_arctic_awbrms.wav ADDED
Binary file (92.7 kB). View file
 
style_vector/en_US_cmu_arctic_axb.wav ADDED
Binary file (92.2 kB). View file
 
style_vector/en_US_cmu_arctic_bdl.wav ADDED
Binary file (90.7 kB). View file
 
style_vector/en_US_cmu_arctic_clb.wav ADDED
Binary file (96.3 kB). View file
 
style_vector/en_US_cmu_arctic_eey.wav ADDED
Binary file (90.7 kB). View file
 
style_vector/en_US_cmu_arctic_fem.wav ADDED
Binary file (90.2 kB). View file
 
style_vector/en_US_cmu_arctic_gka.wav ADDED
Binary file (90.7 kB). View file
 
style_vector/en_US_cmu_arctic_jmk.wav ADDED
Binary file (92.7 kB). View file
 
style_vector/en_US_cmu_arctic_ksp.wav ADDED
Binary file (93.7 kB). View file
 
style_vector/en_US_cmu_arctic_ljm.wav ADDED
Binary file (89.1 kB). View file
 
style_vector/en_US_cmu_arctic_lnh.wav ADDED
Binary file (91.2 kB). View file
 
style_vector/en_US_cmu_arctic_rxr.wav ADDED
Binary file (93.2 kB). View file
 
style_vector/en_US_cmu_arctic_slp.wav ADDED
Binary file (93.2 kB). View file
 
style_vector/en_US_cmu_arctic_slt.wav ADDED
Binary file (92.2 kB). View file
 
style_vector/en_US_hifi-tts_6097.wav ADDED
Binary file (89.1 kB). View file
 
style_vector/en_US_hifi-tts_9017.wav ADDED
Binary file (88.6 kB). View file
 
style_vector/en_US_hifi-tts_92.wav ADDED
Binary file (90.7 kB). View file
 
style_vector/en_US_ljspeech.wav ADDED
Binary file (101 kB). View file
 
style_vector/en_US_m-ailabs_elliot_miller.wav ADDED
Binary file (102 kB). View file
 
style_vector/en_US_m-ailabs_judy_bieber.wav ADDED
Binary file (104 kB). View file
 
style_vector/en_US_m-ailabs_mary_ann.wav ADDED
Binary file (103 kB). View file
 
style_vector/en_US_vctk_p225.wav ADDED
Binary file (96.8 kB). View file
 
style_vector/en_US_vctk_p226.wav ADDED
Binary file (98.3 kB). View file
 
style_vector/en_US_vctk_p227.wav ADDED
Binary file (97.8 kB). View file
 
style_vector/en_US_vctk_p228.wav ADDED
Binary file (94.8 kB). View file
 
style_vector/en_US_vctk_p229.wav ADDED
Binary file (95.3 kB). View file
 
style_vector/en_US_vctk_p230.wav ADDED
Binary file (95.8 kB). View file
 
style_vector/en_US_vctk_p231.wav ADDED
Binary file (94.8 kB). View file
 
style_vector/en_US_vctk_p232.wav ADDED
Binary file (93.7 kB). View file
 
style_vector/en_US_vctk_p233.wav ADDED
Binary file (95.8 kB). View file
 
style_vector/en_US_vctk_p234.wav ADDED
Binary file (95.8 kB). View file
 
style_vector/en_US_vctk_p236.wav ADDED
Binary file (93.2 kB). View file
 
style_vector/en_US_vctk_p237.wav ADDED
Binary file (95.3 kB). View file
 
style_vector/en_US_vctk_p238.wav ADDED
Binary file (103 kB). View file
 
style_vector/en_US_vctk_p239.wav ADDED
Binary file (94.8 kB). View file
 
style_vector/en_US_vctk_p240.wav ADDED
Binary file (97.8 kB). View file
 
style_vector/en_US_vctk_p241.wav ADDED
Binary file (93.2 kB). View file
 
style_vector/en_US_vctk_p243.wav ADDED
Binary file (97.3 kB). View file
 
style_vector/en_US_vctk_p244.wav ADDED
Binary file (93.7 kB). View file
 
style_vector/en_US_vctk_p245.wav ADDED
Binary file (98.3 kB). View file
 
style_vector/en_US_vctk_p246.wav ADDED
Binary file (98.3 kB). View file
 
style_vector/en_US_vctk_p247.wav ADDED
Binary file (97.3 kB). View file
 
style_vector/en_US_vctk_p248.wav ADDED
Binary file (102 kB). View file
 
style_vector/en_US_vctk_p249.wav ADDED
Binary file (96.3 kB). View file
 
style_vector/en_US_vctk_p250.wav ADDED
Binary file (93.2 kB). View file