|
import torch |
|
from speechbrain.inference.interfaces import Pretrained |
|
import openvino as ov |
|
|
|
class CustomEncoderWav2vec2Classifier(Pretrained):
    """A ready-to-use class for utterance-level classification (e.g.
    speaker-id, language-id, emotion recognition, keyword spotting, etc.).

    The class assumes that a self-supervised encoder like wav2vec2/hubert
    and a classifier model are defined in the yaml file. If you want to
    convert the predicted index into a corresponding text label, please
    provide the path of the label_encoder in a variable called
    'lab_encoder_file' within the yaml.

    The class can be used either to run only the encoder (encode_batch())
    to extract embeddings or to run a classification step
    (classify_batch()).

    The encoder runs either through the PyTorch module ``mods.wav2vec2``
    (``backend="pytorch"``) or through an OpenVINO model converted from a
    PyTorch module (``backend="openvino"``).

    Arguments
    ---------
    *args, **kwargs
        Forwarded unchanged to ``Pretrained.__init__``.
    model : torch.nn.Module, optional
        PyTorch encoder to convert to OpenVINO. Only used when
        ``backend="openvino"``.
    audio_file_path : str, optional
        Audio file loaded to build the example input for the OpenVINO
        conversion. Required when ``model`` is given.
    backend : str
        ``"pytorch"`` (default) or ``"openvino"``.
    opts : dict, optional
        Backend options. The OpenVINO backend reads ``"ov_device"`` and
        ``"config"`` (both optional; missing entries are passed to
        ``Core.compile_model`` as None). The PyTorch backend reads
        ``"torch_device"`` and falls back to ``torch_device`` if absent.
    torch_device : str
        Device on which the torch tensors are placed.
    save_ov_model : bool
        If True, the converted OpenVINO model is saved to disk.
    ov_ir_file_path : str, optional
        Destination of the saved OpenVINO IR file. Defaults to
        ``./openvino_model/fp32/speechbrain_emotion_recog_ov_ir_model.xml``.

    Example
    -------
    >>> import torchaudio
    >>> from speechbrain.inference.classifiers import EncoderClassifier
    >>> # Model is downloaded from the speechbrain HuggingFace repo
    >>> tmpdir = getfixture("tmpdir")
    >>> classifier = EncoderClassifier.from_hparams(
    ...     source="speechbrain/spkrec-ecapa-voxceleb",
    ...     savedir=tmpdir,
    ... )

    >>> # Compute embeddings
    >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
    >>> embeddings = classifier.encode_batch(signal)

    >>> # Classification
    >>> prediction = classifier.classify_batch(signal)
    """

    # Location used for the saved IR file when the caller gives no path.
    _DEFAULT_OV_IR_PATH = (
        "./openvino_model/fp32/speechbrain_emotion_recog_ov_ir_model.xml"
    )

    def __init__(
        self,
        *args,
        model=None,
        audio_file_path=None,
        backend="pytorch",
        opts=None,
        torch_device="cpu",
        save_ov_model=False,
        ov_ir_file_path=None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        # ``opts`` is optional: treat a missing dict as "no overrides" so
        # the default construction path does not fail on ``None[...]``.
        opts = {} if opts is None else opts

        self.backend = backend
        # Always defined, whatever the backend, so encode_batch() can rely
        # on it.
        self.torch_device = torch_device

        if backend == "openvino":
            print("=" * 30)
            print("OpenVINO Backend Selected")
            print("=" * 30)

            self.core = ov.Core()
            self.ov_model = None
            self.compiled_model = None

            # Identity check: container modules define __len__, so an
            # empty one would be falsy under a plain truthiness test.
            if model is not None:
                print("\n[INFO] Preparing OpenVINO model...")
                self.get_ov_model(model, audio_file_path)
                print("[SUCCESS] PyTorch model converted to OpenVINO IR model!\n")

            if self.ov_model is not None:
                print("[INFO] Compiling OpenVINO IR model for inference...")
                self.compiled_model = self.core.compile_model(
                    self.ov_model,
                    device_name=opts.get("ov_device"),
                    config=opts.get("config"),
                )
                print("[SUCCESS] OpenVINO IR model compiled for inference!\n")

                # Saving only makes sense once a model has been converted.
                if save_ov_model:
                    print("[INFO] Saving OpenVINO IR model to disk!\n")
                    if ov_ir_file_path is None:
                        ov_ir_file_path = self._DEFAULT_OV_IR_PATH
                    ov.save_model(self.ov_model, ov_ir_file_path)
                    print(
                        f"[SUCCESS] OpenVINO IR model file saved at {ov_ir_file_path}!\n"
                    )
        elif backend == "pytorch":
            # An explicit entry in ``opts`` wins over the keyword argument.
            self.torch_device = opts.get("torch_device", torch_device)

    def encode_batch(self, wavs, wav_lens=None, normalize=False):
        """Encodes the input audio into a single vector embedding.

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = <this>.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.
            A 1-D tensor is treated as a batch of one.
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding. Defaults to all ones.
        normalize : bool
            Accepted for interface compatibility. NOTE: this
            implementation does not currently use it; no embedding
            normalization is applied.

        Returns
        -------
        torch.tensor
            The encoded batch, flattened to [batch, features].

        Raises
        ------
        ValueError
            If ``self.backend`` is neither "pytorch" nor "openvino".
        """
        # Promote a single waveform to a batch of one.
        if wavs.dim() == 1:
            wavs = wavs.unsqueeze(0)

        # Without explicit lengths, every item is taken as full length.
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.torch_device)

        wavs = wavs.to(self.torch_device).float()
        wav_lens = wav_lens.to(self.torch_device)

        if self.backend == "pytorch":
            outputs = self.mods.wav2vec2(wavs)
        elif self.backend == "openvino":
            outputs = self.ov_inference(wavs, wav_lens)
        else:
            raise ValueError(
                f"Unsupported backend {self.backend!r}; "
                "expected 'pytorch' or 'openvino'."
            )

        # Pool over time, then flatten everything but the batch axis.
        outputs = self.mods.avg_pool(outputs, wav_lens)
        return outputs.view(outputs.shape[0], -1)

    def classify_batch(self, wavs, wav_lens=None):
        """Performs classification on the top of the encoded features.

        It returns the posterior probabilities, the index and the text
        label decoded by the label encoder.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        out_prob
            The output of ``hparams.softmax`` for each class
            ([batch, N_class]); these are log posteriors if the yaml
            configures a log-softmax.
        score:
            The value of ``out_prob`` for the best class ([batch,])
        index
            The indexes of the best class ([batch,])
        text_lab:
            List with the text labels corresponding to the indexes.
            (label encoder should be provided).
        """
        embeddings = self.encode_batch(wavs, wav_lens)
        logits = self.mods.output_mlp(embeddings)
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab

    def classify_file(self, path):
        """Classifies the given audiofile into the given set of labels.

        Arguments
        ---------
        path : str
            Path to audio file to classify.

        Returns
        -------
        out_prob
            The output of ``hparams.softmax`` for each class
            ([batch, N_class]); these are log posteriors if the yaml
            configures a log-softmax.
        score:
            The value of ``out_prob`` for the best class ([batch,])
        index
            The indexes of the best class ([batch,])
        text_lab:
            List with the text labels corresponding to the indexes.
            (label encoder should be provided).
        """
        waveform = self.load_audio(path)

        # Fake a batch of one item at full relative length.
        batch = waveform.unsqueeze(0)
        rel_length = torch.tensor([1.0])

        embeddings = self.encode_batch(batch, rel_length)
        logits = self.mods.output_mlp(embeddings).squeeze(1)
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab

    def get_ov_model(self, torch_model, path):
        """Converts a PyTorch model to an OpenVINO model.

        The audio file at ``path`` is loaded and batched to serve as the
        example input for the conversion. The result is stored in
        ``self.ov_model``.

        Arguments
        ---------
        torch_model : torch.nn.Module
            The PyTorch model to convert.
        path : str
            Path to the audio file used as example input.

        Raises
        ------
        ValueError
            If ``path`` is None.
        """
        if path is None:
            raise ValueError(
                "audio_file_path is required to build the example input "
                "for the OpenVINO conversion."
            )

        waveform = self.load_audio(path)
        example_batch = waveform.unsqueeze(0)
        self.ov_model = ov.convert_model(torch_model, example_input=example_batch)

    def ov_inference(self, wavs, wav_lens):
        """Runs the compiled OpenVINO model on a batch of waveforms.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms fed to the compiled model.
        wav_lens : torch.tensor
            Unused here; kept so the signature stays unchanged.

        Returns
        -------
        torch.tensor
            First output of the compiled model, placed on the device of
            ``wavs``.

        Raises
        ------
        RuntimeError
            If no OpenVINO model has been compiled.
        """
        if getattr(self, "compiled_model", None) is None:
            raise RuntimeError(
                "No compiled OpenVINO model is available; pass `model` and "
                "`audio_file_path` when constructing with backend='openvino'."
            )

        # Hand OpenVINO a host-side float32 array rather than a torch
        # tensor, so non-CPU or grad-tracking inputs are handled as well.
        ov_input = wavs.detach().cpu().float().numpy()
        ov_output = self.compiled_model(ov_input)[0]
        # Return on the caller's device so later torch ops see one device.
        return torch.from_numpy(ov_output).to(wavs.device)

    def forward(self, wavs, wav_lens=None, normalize=False):
        """Runs encode_batch() with the same arguments and returns its result."""
        return self.encode_batch(
            wavs=wavs, wav_lens=wav_lens, normalize=normalize
        )
|
|