Batched inference does not yield the same results as individual sample inference
Hi
First of all, thanks for the great work!
I wanted to use this model for batched inference. However, when doing so, I noticed that the results differ from those obtained by running inference on the individual samples separately. The following adapted example code can be run as a minimal reproducible example:
```python
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
            attention_mask,
    ):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)
        return hidden_states, logits


# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signals = []
for i in range(64):
    signal = np.random.rand(5 * 16000)
    # randomly truncate signal
    signal = signal[:int(random.uniform(0, 5) * 16000)]
    signals.append(signal)


def process_func(
    file_path: str,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""
    inputs = processor(signals, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # batched inference
    with torch.no_grad():
        y = model(inputs["input_values"], inputs["attention_mask"])[1]
    y = y.detach().cpu().numpy()
    pd.DataFrame({"valence": y[:, 0], "arousal": y[:, 1]}).to_csv('batched_audio_predictions.csv')

    # non-batched inference without attention mask -> different outputs
    y2_valence = []
    y2_arousal = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values[attention_mask].reshape(1, -1), None)
        y2_valence.append(y2[1][0][0].detach().cpu().numpy())
        y2_arousal.append(y2[1][0][1].detach().cpu().numpy())
    pd.DataFrame({"valence": y2_valence, "arousal": y2_arousal}).to_csv('non_batched_audio_predictions.csv')

    # non-batched inference with attention mask -> same outputs as the batched inference
    y2_valence = []
    y2_arousal = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values.unsqueeze(0), attention_mask.unsqueeze(0))
        y2_valence.append(y2[1][0][0].detach().cpu().numpy())
        y2_arousal.append(y2[1][0][1].detach().cpu().numpy())
    pd.DataFrame({"valence": y2_valence, "arousal": y2_arousal}).to_csv('non_batched_audio_with_mask_predictions.csv')

    return y


print(process_func("batched_audio_example.pt", sampling_rate))
# Arousal dominance valence
# [[0.5460754 0.6062266 0.40431657]]
```
Any ideas or experience as to why this is happening? It seems to me that the attention mask should be passed when doing batched inference (as I do in the example above), since the `feat_extract_norm` parameter of the config is `"layer"`, as stated in the Wav2Vec2 documentation.
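For reference, this can be checked directly on the checkpoint (a small sketch, not part of the example above; it only inspects the config and the feature extractor):

```python
from transformers import Wav2Vec2Config, Wav2Vec2Processor

model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
config = Wav2Vec2Config.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# models with layer norm in the feature encoder were fine-tuned with an
# attention mask, so the processor is expected to return one
print(config.feat_extract_norm)                            # expected: "layer"
print(processor.feature_extractor.return_attention_mask)   # expected: True
```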
Thanks in advance for your time and answer!
Hi!
Thanks a lot for your interest in the model.
I see several issues in the code you submitted:
`EmotionModel.forward()` passes the `attention_mask` to `self.wav2vec2()`, but it is not considered when computing the mean across the embeddings. To solve this, you can replace the line

```python
hidden_states = torch.mean(hidden_states, dim=1)
```

by

```python
if attention_mask is not None:
    attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
    hidden_states = hidden_states * torch.reshape(attention_mask, (-1, attention_mask.shape[-1], 1))
    hidden_states = torch.sum(hidden_states, dim=1)
    attention_sum = torch.sum(attention_mask, dim=1)
    hidden_states = hidden_states / torch.reshape(attention_sum, (-1, 1))
else:
    hidden_states = torch.mean(hidden_states, dim=1)
```
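To see why this matters, here is a small standalone sketch (toy tensors, not actual model outputs; the toy mask is already at frame resolution, so the `_get_feature_vector_attention_mask()` step is skipped): a plain mean over padded frames differs from the masked mean computed above.

```python
import torch

# toy "hidden states": batch of 2 samples, 4 frames, 3 features;
# the second sample has only 2 valid frames, the rest is padding (zeros)
hidden_states = torch.tensor([
    [[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [4., 4., 4.]],
    [[1., 1., 1.], [2., 2., 2.], [0., 0., 0.], [0., 0., 0.]],
])
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])

# plain mean: the padded frames drag down the average of the second sample
plain_mean = hidden_states.mean(dim=1)

# masked mean: only valid frames contribute
mask = attention_mask.unsqueeze(-1)          # (batch, frames, 1)
masked_mean = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)

print(plain_mean[1])   # tensor([0.7500, 0.7500, 0.7500])
print(masked_mean[1])  # tensor([1.5000, 1.5000, 1.5000])
```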
- In the second inference option (non-batched inference without attention mask), where the attention mask is intended to be applied before the model is called, `attention_mask` is of type `torch.int32`. With `input_values[attention_mask]`, `input_values` is therefore sliced (with the 0 / 1 entries used as indices) rather than masked. In order to apply a mask, `attention_mask` needs to be converted to (e.g.) boolean, i.e. `attention_mask.bool()` for a `torch.Tensor` (see the short sketch below).
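To make the difference concrete, here is a tiny standalone sketch (toy values, unrelated to the model): with recent PyTorch versions an integer index tensor gathers by position, whereas a boolean tensor selects by mask.

```python
import torch

input_values = torch.tensor([10., 20., 30., 40.])
attention_mask = torch.tensor([1, 1, 1, 0], dtype=torch.int32)

# integer indexing: the 0/1 entries are treated as positions,
# so this repeatedly gathers elements 1 and 0
print(input_values[attention_mask])         # tensor([20., 20., 20., 10.])

# boolean indexing: entries where the mask is True are kept
print(input_values[attention_mask.bool()])  # tensor([10., 20., 30.])
```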
Applying these two changes will result in identical predictions.
In fact, all three options produced "wrong" predictions in the original code, but the impact of issue 1 (which affects the 1st and the 3rd inference) is not as severe as the impact of issue 2.
There are two more problems:

- The order of the output logits is `Arousal, Dominance, Valence` (you are assuming `Valence`, `Arousal` in your code); see the short sketch after this list.
- The model has a minimum requirement for the input length. Thus, in some rare cases, the inference might fail if one of the input signals has a very short duration (<< 1.0 s).
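As a small illustration of the first point (the values are taken from the example output at the end of the script below), the three output columns can be mapped to named dimensions explicitly; the second point is avoided in the consolidated code by sampling durations of at least 0.1 s.

```python
import numpy as np

# example model output (shape: batch x 3); column order: arousal, dominance, valence
y = np.array([[0.5460754, 0.6062266, 0.40431657]])

dims = ['arousal', 'dominance', 'valence']
predictions = {dim: y[:, i] for i, dim in enumerate(dims)}
print(predictions['arousal'])  # [0.5460754]
print(predictions['valence'])  # [0.40431657]
```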
Please find the consolidated code below:
```python
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
            attention_mask,
    ):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]
        if attention_mask is not None:
            # average only over the non-padded frames
            attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states = hidden_states * torch.reshape(attention_mask, (-1, attention_mask.shape[-1], 1))
            hidden_states = torch.sum(hidden_states, dim=1)
            attention_sum = torch.sum(attention_mask, dim=1)
            hidden_states = hidden_states / torch.reshape(attention_sum, (-1, 1))
        else:
            hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)
        return hidden_states, logits


# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signals = []
for i in range(64):
    signal = np.random.rand(5 * 16000)
    # randomly truncate signal (keep at least 0.1 s to satisfy the
    # minimum input length of the model)
    signal = signal[:int(random.uniform(0.1, 5) * 16000)]
    signals.append(signal)


def process_func(
    file_path: str,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""
    inputs = processor(signals, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # batched inference
    # output order of the logits: arousal, dominance, valence
    with torch.no_grad():
        y = model(inputs["input_values"], inputs["attention_mask"])[1]
    y = y.detach().cpu().numpy()
    pd.DataFrame({"arousal": y[:, 0], "valence": y[:, 2]}).to_csv('batched_audio_predictions.csv')

    # non-batched inference without attention mask -> same outputs
    # once the mask is converted to boolean before slicing
    y2_arousal = []
    y2_valence = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values[attention_mask.bool()].reshape(1, -1), None)
        y2_arousal.append(y2[1][0][0].detach().cpu().numpy())
        y2_valence.append(y2[1][0][2].detach().cpu().numpy())
    pd.DataFrame({"arousal": y2_arousal, "valence": y2_valence}).to_csv('non_batched_audio_predictions.csv')

    # non-batched inference with attention mask -> same outputs as the batched inference
    y2_arousal = []
    y2_valence = []
    for input_values, attention_mask in zip(inputs["input_values"], inputs["attention_mask"]):
        y2 = model(input_values.unsqueeze(0), attention_mask.unsqueeze(0))
        y2_arousal.append(y2[1][0][0].detach().cpu().numpy())
        y2_valence.append(y2[1][0][2].detach().cpu().numpy())
    pd.DataFrame({"arousal": y2_arousal, "valence": y2_valence}).to_csv('non_batched_audio_with_mask_predictions.csv')

    return y


print(process_func("batched_audio_example.pt", sampling_rate))
# Arousal dominance valence
# [[0.5460754 0.6062266 0.40431657]]
```
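To double-check that the variants now agree, the written CSV files can be compared directly (a small sketch; it assumes the script above has been run in the same directory, and the tolerance is chosen arbitrarily):

```python
import numpy as np
import pandas as pd

batched = pd.read_csv('batched_audio_predictions.csv', index_col=0)
no_mask = pd.read_csv('non_batched_audio_predictions.csv', index_col=0)
with_mask = pd.read_csv('non_batched_audio_with_mask_predictions.csv', index_col=0)

# with the masked mean and the boolean mask fix, all three variants
# should agree up to small numerical differences
print(np.allclose(batched.values, no_mask.values, atol=1e-4))
print(np.allclose(batched.values, with_mask.values, atol=1e-4))
```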