# Spaces:
# Sleeping
# Sleeping
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import MSELoss
from transformers import Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
    """Wav2Vec2-BERT encoder topped with a pooled, non-negative regression head.

    The encoder output (or a learned weighted sum of all layer outputs when
    ``config.use_weighted_layer_sum`` is set) is projected, averaged over dim 1
    (masking padded positions when an ``attention_mask`` is supplied), passed
    through a linear classifier and finally through a ReLU.  The loss is always
    mean-squared error, regardless of ``config.num_labels``.

    Adapted from the sequence-classification head shipped with ``transformers``;
    unlike that head, ``forward`` here applies a ReLU to the logits and never
    uses a cross-entropy loss.
    """

    # Index at which the per-layer hidden states start in the base model's
    # tuple-style output.  The original code referenced a bare module-level
    # name that this file never defined (NameError at runtime); the value 2
    # mirrors the constant of the same name in transformers'
    # modeling_wav2vec2_bert (NOTE(review): confirm against the installed
    # transformers version).
    _HIDDEN_STATES_START_POSITION = 2

    def __init__(self, config):
        """Build the base encoder plus the projection/classification head.

        Args:
            config: Model configuration. Reads ``add_adapter`` (optional),
                ``num_hidden_layers``, ``use_weighted_layer_sum``,
                ``hidden_size``, ``classifier_proj_size`` and ``num_labels``.

        Raises:
            ValueError: If ``config.add_adapter`` is present and truthy.
        """
        super().__init__(config)

        if getattr(config, "add_adapter", False):
            raise ValueError(
                "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
            )

        self.wav2vec2_bert = Wav2Vec2BertModel(config)

        if config.use_weighted_layer_sum:
            # One weight per transformer layer, plus one for the input embeddings.
            num_layers = config.num_hidden_layers + 1
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_bert.parameters():
            param.requires_grad = False

    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        Run the encoder and the pooled regression head.

        Args:
            input_features: Input passed positionally to the base model.
            attention_mask: Optional mask forwarded to the base model and also
                used to exclude padded positions from the mean pooling.
            output_attentions: Forwarded to the base model.
            output_hidden_states: Forwarded to the base model; forced to
                `True` when `config.use_weighted_layer_sum` is set.
            return_dict: Whether to return a `SequenceClassifierOutput`
                instead of a plain tuple. Defaults to
                `config.use_return_dict`.
            labels (`torch.Tensor`, *optional*): Regression targets. They are
                reshaped to `(-1, config.num_labels)` and cast to the logits'
                dtype and device, then compared to the ReLU-clamped logits
                with a Mean-Square loss. No cross-entropy loss is ever
                computed, whatever the value of `config.num_labels`.

        Returns:
            A `SequenceClassifierOutput` (loss, logits, hidden_states,
            attentions), or, when `return_dict` is false, a tuple
            `(loss?, logits, *remaining base-model outputs)`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every layer's output.
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack the per-layer outputs along a new dim 1 and collapse them
            # with softmax-normalised learned weights.
            layer_states = torch.stack(outputs[self._HIDDEN_STATES_START_POSITION], dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (layer_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            # Zero the padded positions out-of-place so the projector output is
            # not mutated, then average over the valid positions only.
            # assumes padding_mask is a boolean (batch, seq) tensor — TODO confirm
            hidden_states = hidden_states.masked_fill(~padding_mask.unsqueeze(-1), 0.0)
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)
        # Clamp predictions to be non-negative.
        logits = nn.functional.relu(logits)

        loss = None
        if labels is not None:
            # MSELoss needs targets with the predictions' dtype and device.
            targets = labels.to(device=logits.device, dtype=logits.dtype)
            loss_fct = MSELoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                targets.view(-1, self.config.num_labels),
            )

        if not return_dict:
            output = (logits,) + outputs[self._HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )