# Copyright (c) 2023 Amphion. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # -*- encoding: utf-8 -*- import torch import torch.nn as nn from asteroid_filterbanks import Encoder, ParamSincFB from .RawNetBasicBlock import Bottle2neck, PreEmphasis class RawNet3(nn.Module): def __init__(self, block, model_scale, context, summed, C=1024, **kwargs): super().__init__() nOut = kwargs["nOut"] self.context = context self.encoder_type = kwargs["encoder_type"] self.log_sinc = kwargs["log_sinc"] self.norm_sinc = kwargs["norm_sinc"] self.out_bn = kwargs["out_bn"] self.summed = summed self.preprocess = nn.Sequential( PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True) ) self.conv1 = Encoder( ParamSincFB( C // 4, 251, stride=kwargs["sinc_stride"], ) ) self.relu = nn.ReLU() self.bn1 = nn.BatchNorm1d(C // 4) self.layer1 = block( C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5 ) self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3) self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale) self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1) if self.context: attn_input = 1536 * 3 else: attn_input = 1536 print("self.encoder_type", self.encoder_type) if self.encoder_type == "ECA": attn_output = 1536 elif self.encoder_type == "ASP": attn_output = 1 else: raise ValueError("Undefined encoder") self.attention = nn.Sequential( nn.Conv1d(attn_input, 128, kernel_size=1), nn.ReLU(), nn.BatchNorm1d(128), nn.Conv1d(128, attn_output, kernel_size=1), nn.Softmax(dim=2), ) self.bn5 = nn.BatchNorm1d(3072) self.fc6 = nn.Linear(3072, nOut) self.bn6 = nn.BatchNorm1d(nOut) self.mp3 = nn.MaxPool1d(3) def forward(self, x): """ :param x: input mini-batch (bs, samp) """ with torch.cuda.amp.autocast(enabled=False): x = self.preprocess(x) x = torch.abs(self.conv1(x)) if self.log_sinc: x = torch.log(x + 1e-6) if self.norm_sinc == "mean": x = x - torch.mean(x, dim=-1, keepdim=True) elif self.norm_sinc == "mean_std": m = torch.mean(x, dim=-1, keepdim=True) s = torch.std(x, dim=-1, keepdim=True) s[s < 0.001] = 0.001 x = (x - m) / s if self.summed: x1 = self.layer1(x) x2 = self.layer2(x1) x3 = self.layer3(self.mp3(x1) + x2) else: x1 = self.layer1(x) x2 = self.layer2(x1) x3 = self.layer3(x2) x = self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1)) x = self.relu(x) t = x.size()[-1] if self.context: global_x = torch.cat( ( x, torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), torch.sqrt( torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4) ).repeat(1, 1, t), ), dim=1, ) else: global_x = x w = self.attention(global_x) mu = torch.sum(x * w, dim=2) sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)) x = torch.cat((mu, sg), 1) x = self.bn5(x) x = self.fc6(x) if self.out_bn: x = self.bn6(x) return x def MainModel(**kwargs): model = RawNet3(Bottle2neck, model_scale=8, context=True, summed=True, **kwargs) return model