# coding: utf-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from models.modules import Conv_1d, ResSE_1d, Conv_2d, Res_2d, Conv_V, Conv_H, HarmonicSTFT, Res_2d_mp
from models.attention_modules import BertConfig, BertEncoder, BertEmbeddings, BertPooler, PositionalEncoding


class FCN(nn.Module):
    '''
    Choi et al. 2016
    Automatic tagging using deep convolutional neural networks.
    Fully convolutional network.
    '''
    def __init__(self,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=96,
                 n_class=50):
        super(FCN, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # FCN
        self.layer1 = Conv_2d(1, 64, pooling=(2,4))
        self.layer2 = Conv_2d(64, 128, pooling=(2,4))
        self.layer3 = Conv_2d(128, 128, pooling=(2,4))
        self.layer4 = Conv_2d(128, 128, pooling=(3,5))
        self.layer5 = Conv_2d(128, 64, pooling=(4,4))

        # Dense
        self.dense = nn.Linear(64, n_class)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # FCN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)

        # Dense
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.dense(x)
        x = nn.Sigmoid()(x)

        return x


class Musicnn(nn.Module):
    '''
    Pons et al. 2017
    End-to-end learning for music audio tagging at scale.
    An updated implementation of the original paper, following the reference Musicnn code:
    https://github.com/jordipons/musicnn
    '''
    def __init__(self,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=96,
                 n_class=50,
                 dataset='mtat'):
        super(Musicnn, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # Pons front-end
        m1 = Conv_V(1, 204, (int(0.7*96), 7))
        m2 = Conv_V(1, 204, (int(0.4*96), 7))
        m3 = Conv_H(1, 51, 129)
        m4 = Conv_H(1, 51, 65)
        m5 = Conv_H(1, 51, 33)
        self.layers = nn.ModuleList([m1, m2, m3, m4, m5])

        # Pons back-end
        backend_channel = 512 if dataset == 'msd' else 64
        self.layer1 = Conv_1d(561, backend_channel, 7, 1, 1)
        self.layer2 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)
        self.layer3 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)

        # Dense
        dense_channel = 500 if dataset == 'msd' else 200
        self.dense1 = nn.Linear((561 + (backend_channel * 3)) * 2, dense_channel)
        self.bn = nn.BatchNorm1d(dense_channel)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(dense_channel, n_class)

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # Pons front-end
        out = []
        for layer in self.layers:
            out.append(layer(x))
        out = torch.cat(out, dim=1)

        # Pons back-end
        length = out.size(2)
        res1 = self.layer1(out)
        res2 = self.layer2(res1) + res1
        res3 = self.layer3(res2) + res2
        out = torch.cat([out, res1, res2, res3], 1)

        mp = nn.MaxPool1d(length)(out)
        avgp = nn.AvgPool1d(length)(out)

        out = torch.cat([mp, avgp], dim=1)
        out = out.squeeze(2)

        out = self.relu(self.bn(self.dense1(out)))
        out = self.dropout(out)
        out = self.dense2(out)
        out = nn.Sigmoid()(out)

        return out
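
# A minimal usage sketch (not part of the original file): both taggers above
# take a raw mono waveform batch of shape (batch, samples) at 16 kHz and return
# sigmoid tag probabilities. The clip lengths below are assumptions derived
# from the pooling arithmetic in this file, not constants it defines.
def _demo_spectrogram_frontends():
    fcn = FCN().eval()
    musicnn = Musicnn(dataset='mtat').eval()
    with torch.no_grad():
        # ~29 s: FCN's (2,4)/(2,4)/(2,4)/(3,5)/(4,4) pooling needs roughly this
        # many spectrogram frames to collapse the time axis to a single step.
        fcn_out = fcn(torch.randn(2, 464000))         # -> (2, 50)
        # ~3 s suffices for Musicnn; its global max/avg pooling absorbs length.
        musicnn_out = musicnn(torch.randn(2, 48000))  # -> (2, 50)
    return fcn_out.shape, musicnn_out.shape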


class CRNN(nn.Module):
    '''
    Choi et al. 2017
    Convolutional recurrent neural networks for music classification.
    Feature extraction with CNN + temporal summary with RNN.
    '''
    def __init__(self,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=96,
                 n_class=50):
        super(CRNN, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # CNN
        self.layer1 = Conv_2d(1, 64, pooling=(2,2))
        self.layer2 = Conv_2d(64, 128, pooling=(3,3))
        self.layer3 = Conv_2d(128, 128, pooling=(4,4))
        self.layer4 = Conv_2d(128, 128, pooling=(4,4))

        # RNN
        self.layer5 = nn.GRU(128, 32, 2, batch_first=True)

        # Dense
        self.dropout = nn.Dropout(0.5)
        self.dense = nn.Linear(32, n_class)

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # RNN
        x = x.squeeze(2)
        x = x.permute(0, 2, 1)
        x, _ = self.layer5(x)
        x = x[:, -1, :]

        # Dense
        x = self.dropout(x)
        x = self.dense(x)
        x = nn.Sigmoid()(x)

        return x


class SampleCNN(nn.Module):
    '''
    Lee et al. 2017
    Sample-level deep convolutional neural networks for music auto-tagging using raw waveforms.
    Sample-level CNN.
    '''
    def __init__(self, n_class=50):
        super(SampleCNN, self).__init__()

        self.layer1 = Conv_1d(1, 128, shape=3, stride=3, pooling=1)
        self.layer2 = Conv_1d(128, 128, shape=3, stride=1, pooling=3)
        self.layer3 = Conv_1d(128, 128, shape=3, stride=1, pooling=3)
        self.layer4 = Conv_1d(128, 256, shape=3, stride=1, pooling=3)
        self.layer5 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer6 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer7 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer8 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer9 = Conv_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer10 = Conv_1d(256, 512, shape=3, stride=1, pooling=3)
        self.layer11 = Conv_1d(512, 512, shape=1, stride=1, pooling=1)
        self.dropout = nn.Dropout(0.5)
        self.dense = nn.Linear(512, n_class)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = self.layer8(x)
        x = self.layer9(x)
        x = self.layer10(x)
        x = self.layer11(x)
        x = x.squeeze(-1)
        x = self.dropout(x)
        x = self.dense(x)
        x = nn.Sigmoid()(x)

        return x
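
# A minimal usage sketch (not part of the original file): the sample-level
# models consume the raw waveform with no spectrogram front end. 59049 = 3^10
# samples is an assumption chosen so the ten /3 time reductions land exactly on
# a single step before the final 1x1 layer; SampleCNNSE below shares the same
# interface.
def _demo_sample_level():
    model = SampleCNN().eval()
    with torch.no_grad():
        out = model(torch.randn(2, 59049))  # -> (2, 50)
    return out.shape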


class SampleCNNSE(nn.Module):
    '''
    Kim et al. 2018
    Sample-level CNN architectures for music auto-tagging using raw waveforms.
    Sample-level CNN + residual connections + squeeze & excitation.
    '''
    def __init__(self, n_class=50):
        super(SampleCNNSE, self).__init__()

        self.layer1 = ResSE_1d(1, 128, shape=3, stride=3, pooling=1)
        self.layer2 = ResSE_1d(128, 128, shape=3, stride=1, pooling=3)
        self.layer3 = ResSE_1d(128, 128, shape=3, stride=1, pooling=3)
        self.layer4 = ResSE_1d(128, 256, shape=3, stride=1, pooling=3)
        self.layer5 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer6 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer7 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer8 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer9 = ResSE_1d(256, 256, shape=3, stride=1, pooling=3)
        self.layer10 = ResSE_1d(256, 512, shape=3, stride=1, pooling=3)
        self.layer11 = ResSE_1d(512, 512, shape=1, stride=1, pooling=1)
        self.dropout = nn.Dropout(0.5)
        self.dense1 = nn.Linear(512, 512)
        self.bn = nn.BatchNorm1d(512)
        self.dense2 = nn.Linear(512, n_class)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = self.layer8(x)
        x = self.layer9(x)
        x = self.layer10(x)
        x = self.layer11(x)
        x = x.squeeze(-1)
        x = nn.ReLU()(self.bn(self.dense1(x)))
        x = self.dropout(x)
        x = self.dense2(x)
        x = nn.Sigmoid()(x)

        return x


class ShortChunkCNN(nn.Module):
    '''
    Short-chunk CNN architecture.
    So-called vgg-ish model with a small receptive field.
    Deeper layers, smaller pooling (2x2).
    '''
    def __init__(self,
                 n_channels=128,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=128,
                 n_class=50):
        super(ShortChunkCNN, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # CNN
        self.layer1 = Conv_2d(1, n_channels, pooling=2)
        self.layer2 = Conv_2d(n_channels, n_channels, pooling=2)
        self.layer3 = Conv_2d(n_channels, n_channels*2, pooling=2)
        self.layer4 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer5 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer6 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer7 = Conv_2d(n_channels*2, n_channels*4, pooling=2)

        # Dense
        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
        self.bn = nn.BatchNorm1d(n_channels*4)
        self.dense2 = nn.Linear(n_channels*4, n_class)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = x.squeeze(2)

        # Global Max Pooling
        if x.size(-1) != 1:
            x = nn.MaxPool1d(x.size(-1))(x)
        x = x.squeeze(2)

        # Dense
        x = self.dense1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        x = nn.Sigmoid()(x)

        return x
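
# A minimal usage sketch (not part of the original file): the 128 mel bins are
# halved by each of the seven 2x2 poolings, so the frequency axis collapses
# exactly, while the global max pool absorbs any remaining time steps. The
# ~3.7 s clip is an assumed short-chunk input, not a requirement; any clip long
# enough to survive the seven time halvings works. The residual variant below
# shares the same interface.
def _demo_short_chunk():
    model = ShortChunkCNN().eval()
    with torch.no_grad():
        out = model(torch.randn(2, 59049))  # -> (2, 50)
    return out.shape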


class ShortChunkCNN_Res(nn.Module):
    '''
    Short-chunk CNN architecture with residual connections.
    '''
    def __init__(self,
                 n_channels=128,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=128,
                 n_class=50):
        super(ShortChunkCNN_Res, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # CNN
        self.layer1 = Res_2d(1, n_channels, stride=2)
        self.layer2 = Res_2d(n_channels, n_channels, stride=2)
        self.layer3 = Res_2d(n_channels, n_channels*2, stride=2)
        self.layer4 = Res_2d(n_channels*2, n_channels*2, stride=2)
        self.layer5 = Res_2d(n_channels*2, n_channels*2, stride=2)
        self.layer6 = Res_2d(n_channels*2, n_channels*2, stride=2)
        self.layer7 = Res_2d(n_channels*2, n_channels*4, stride=2)

        # Dense
        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
        self.bn = nn.BatchNorm1d(n_channels*4)
        self.dense2 = nn.Linear(n_channels*4, n_class)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = x.squeeze(2)

        # Global Max Pooling
        if x.size(-1) != 1:
            x = nn.MaxPool1d(x.size(-1))(x)
        x = x.squeeze(2)

        # Dense
        x = self.dense1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        x = nn.Sigmoid()(x)

        return x


class CNNSA(nn.Module):
    '''
    Won et al. 2019
    Toward interpretable music tagging with self-attention.
    Feature extraction with CNN + temporal summary with Transformer encoder.
    '''
    def __init__(self,
                 n_channels=128,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=128,
                 n_class=50):
        super(CNNSA, self).__init__()

        # Spectrogram
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # CNN
        self.layer1 = Res_2d(1, n_channels, stride=2)
        self.layer2 = Res_2d(n_channels, n_channels, stride=2)
        self.layer3 = Res_2d(n_channels, n_channels*2, stride=2)
        self.layer4 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
        self.layer5 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
        self.layer6 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
        self.layer7 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))

        # Transformer encoder
        bert_config = BertConfig(vocab_size=256,
                                 hidden_size=256,
                                 num_hidden_layers=2,
                                 num_attention_heads=8,
                                 intermediate_size=1024,
                                 hidden_act="gelu",
                                 hidden_dropout_prob=0.4,
                                 max_position_embeddings=700,
                                 attention_probs_dropout_prob=0.5)
        self.encoder = BertEncoder(bert_config)
        self.pooler = BertPooler(bert_config)
        self.vec_cls = self.get_cls(256)

        # Dense
        self.dropout = nn.Dropout(0.5)
        self.dense = nn.Linear(256, n_class)

    def get_cls(self, channel):
        np.random.seed(0)
        single_cls = torch.Tensor(np.random.random((1, channel)))
        vec_cls = torch.cat([single_cls for _ in range(64)], dim=0)
        vec_cls = vec_cls.unsqueeze(1)
        return vec_cls

    def append_cls(self, x):
        batch, _, _ = x.size()
        part_vec_cls = self.vec_cls[:batch].clone()
        part_vec_cls = part_vec_cls.to(x.device)
        return torch.cat([part_vec_cls, x], dim=1)

    def forward(self, x):
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)
        x = self.spec_bn(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = x.squeeze(2)

        # Append [CLS] token
        x = x.permute(0, 2, 1)
        x = self.append_cls(x)

        # Transformer encoder
        x = self.encoder(x)
        x = x[-1]
        x = self.pooler(x)

        # Dense
        x = self.dropout(x)
        x = self.dense(x)
        x = nn.Sigmoid()(x)

        return x
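
# A minimal usage sketch (not part of the original file): CNNSA keeps a time
# axis (the last four residual blocks stride only over frequency), prepends a
# learned [CLS] row, and classifies from the pooled [CLS] state. The batch size
# must stay <= 64 because vec_cls holds 64 copies of the token; the ~3.7 s clip
# length is an assumption.
def _demo_cnnsa():
    model = CNNSA().eval()
    with torch.no_grad():
        out = model(torch.randn(2, 59049))  # -> (2, 50)
    return out.shape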


class HarmonicCNN(nn.Module):
    '''
    Won et al. 2020
    Data-driven harmonic filters for audio representation learning.
    Trainable harmonic band-pass filters, short-chunk CNN.
    '''
    def __init__(self,
                 n_channels=128,
                 sample_rate=16000,
                 n_fft=512,
                 f_min=0.0,
                 f_max=8000.0,
                 n_mels=128,
                 n_class=50,
                 n_harmonic=6,
                 semitone_scale=2,
                 learn_bw='only_Q'):
        super(HarmonicCNN, self).__init__()

        # Harmonic STFT
        self.hstft = HarmonicSTFT(sample_rate=sample_rate,
                                  n_fft=n_fft,
                                  n_harmonic=n_harmonic,
                                  semitone_scale=semitone_scale,
                                  learn_bw=learn_bw)
        self.hstft_bn = nn.BatchNorm2d(n_harmonic)

        # CNN
        self.layer1 = Conv_2d(n_harmonic, n_channels, pooling=2)
        self.layer2 = Res_2d_mp(n_channels, n_channels, pooling=2)
        self.layer3 = Res_2d_mp(n_channels, n_channels, pooling=2)
        self.layer4 = Res_2d_mp(n_channels, n_channels, pooling=2)
        self.layer5 = Conv_2d(n_channels, n_channels*2, pooling=2)
        self.layer6 = Res_2d_mp(n_channels*2, n_channels*2, pooling=(2,3))
        self.layer7 = Res_2d_mp(n_channels*2, n_channels*2, pooling=(2,3))

        # Dense
        self.dense1 = nn.Linear(n_channels*2, n_channels*2)
        self.bn = nn.BatchNorm1d(n_channels*2)
        self.dense2 = nn.Linear(n_channels*2, n_class)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Spectrogram
        x = self.hstft_bn(self.hstft(x))

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = x.squeeze(2)

        # Global Max Pooling
        if x.size(-1) != 1:
            x = nn.MaxPool1d(x.size(-1))(x)
        x = x.squeeze(2)

        # Dense
        x = self.dense1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        x = nn.Sigmoid()(x)

        return x
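
if __name__ == '__main__':
    # Smoke test (not part of the original file): exercise the sketches above.
    # HarmonicCNN takes the same (batch, samples) waveform input; its minimum
    # clip length depends on HarmonicSTFT's frame rate, so it is left out of
    # this quick check.
    print(_demo_spectrogram_frontends())
    print(_demo_sample_level())
    print(_demo_short_chunk())
    print(_demo_cnnsa())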