marcmaxmeister
committed
Commit • f02201b
Parent(s): 7c15f78
Adding an example of using pretrained model to predict emotion in local audio file
This card had no practical example of how to use this model, but this repo (https://github.com/m3hrdadfi/soxan) had one that works with this model, so I am adding notes here for others.
README.md CHANGED
@@ -6,4 +6,56 @@ tags:
- audio
- HUBert
---
Working example of using the pretrained model to predict the emotion in a local audio file
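
A quick note on setup: the example below assumes `transformers`, `torch`, `numpy`, and `pydub` are installed (pydub needs ffmpeg for mp3 decoding), plus a local `audio_models.py` providing the `HubertForSpeechClassification` class from the soxan repo. A rough sketch of that assumed environment:

```
# Assumed environment for the example below (inferred from its imports):
#   pip install transformers torch numpy pydub   # pydub relies on ffmpeg to decode mp3
# HubertForSpeechClassification is not shipped with transformers; it is defined in the
# soxan repo (https://github.com/m3hrdadfi/soxan) -- save that class locally as
# audio_models.py so the import below resolves.
```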

```
def predict_emotion_hubert(audio_file):
    """ inspired by an example from https://github.com/m3hrdadfi/soxan """
    from audio_models import HubertForSpeechClassification  # class from the soxan repo, saved locally (see setup note above)
    from transformers import Wav2Vec2FeatureExtractor, AutoConfig
    import torch.nn.functional as F
    import torch
    import numpy as np
    from pydub import AudioSegment

    model = HubertForSpeechClassification.from_pretrained("Rajaram1996/Hubert_emotion")  # Downloading: 362M
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
    sampling_rate = 16000  # defined by the model; must convert mp3 to this rate.
    config = AutoConfig.from_pretrained("Rajaram1996/Hubert_emotion")

    def speech_file_to_array(path, sampling_rate):
        # using torchaudio...
        # speech_array, _sampling_rate = torchaudio.load(path)
        # resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
        # speech = resampler(speech_array).squeeze().numpy()
        sound = AudioSegment.from_file(path)
        sound = sound.set_frame_rate(sampling_rate)
        sound_array = np.array(sound.get_array_of_samples())
        return sound_array

    sound_array = speech_file_to_array(audio_file, sampling_rate)
    inputs = feature_extractor(sound_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to("cpu").float() for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{
        "emo": config.id2label[i],
        "score": round(score * 100, 1)}
        for i, score in enumerate(scores)
    ]
    # return the two highest-scoring emotions, dropping anything that rounded to 0.0
    return [row for row in sorted(outputs, key=lambda x: x["score"], reverse=True) if row['score'] != 0.0][:2]
```

```
>>> result = predict_emotion_hubert("male-crying.mp3")
>>> result
[{'emo': 'male_sad', 'score': 91.0}, {'emo': 'male_fear', 'score': 4.8}]
```
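
The `emo` strings in the output come from the model config's `id2label` mapping (the same `config` object used inside the function). A minimal way to list every label this checkpoint can emit, assuming only that the config downloads as above:

```
from transformers import AutoConfig

# id2label maps class index -> emotion label for this checkpoint
config = AutoConfig.from_pretrained("Rajaram1996/Hubert_emotion")
print(config.id2label)
```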
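
The commented-out lines inside `speech_file_to_array` hint at a torchaudio-based loader instead of pydub. A minimal sketch of that alternative (assuming torchaudio is installed and its backend can decode your file format):

```
import torchaudio

def speech_file_to_array_torchaudio(path, sampling_rate=16000):
    # load the waveform and its native sample rate
    speech_array, _sampling_rate = torchaudio.load(path)
    # resample to the 16 kHz rate the HuBERT feature extractor expects
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    return resampler(speech_array).squeeze().numpy()
```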