"""
Implementation of model from:

Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
Convolutional Recurrent Neural Networks" (2019)
Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
"""
					
						
import torch
from torch import nn
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class JDCNet(nn.Module): | 
					
						
						|  | """ | 
					
						
						|  | Joint Detection and Classification Network model for singing voice melody. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01): | 
					
						
						|  | super().__init__() | 
					
						
						|  | self.num_class = num_class | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.conv_block = nn.Sequential( | 
					
						
						|  | nn.Conv2d( | 
					
						
						|  | in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False | 
					
						
						|  | ), | 
					
						
						|  | nn.BatchNorm2d(num_features=64), | 
					
						
						|  | nn.LeakyReLU(leaky_relu_slope, inplace=True), | 
					
						
						|  | nn.Conv2d(64, 64, 3, padding=1, bias=False), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.res_block1 = ResBlock( | 
					
						
						|  | in_channels=64, out_channels=128 | 
					
						
						|  | ) | 
					
						
						|  | self.res_block2 = ResBlock( | 
					
						
						|  | in_channels=128, out_channels=192 | 
					
						
						|  | ) | 
					
						
						|  | self.res_block3 = ResBlock(in_channels=192, out_channels=256) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.pool_block = nn.Sequential( | 
					
						
						|  | nn.BatchNorm2d(num_features=256), | 
					
						
						|  | nn.LeakyReLU(leaky_relu_slope, inplace=True), | 
					
						
						|  | nn.MaxPool2d(kernel_size=(1, 4)), | 
					
						
						|  | nn.Dropout(p=0.2), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40)) | 
					
						
						|  |  | 
					
						
						|  | self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20)) | 
					
						
						|  |  | 
					
						
						|  | self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10)) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.detector_conv = nn.Sequential( | 
					
						
						|  | nn.Conv2d(640, 256, 1, bias=False), | 
					
						
						|  | nn.BatchNorm2d(256), | 
					
						
						|  | nn.LeakyReLU(leaky_relu_slope, inplace=True), | 
					
						
						|  | nn.Dropout(p=0.2), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.bilstm_classifier = nn.LSTM( | 
					
						
						|  | input_size=512, hidden_size=256, batch_first=True, bidirectional=True | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.bilstm_detector = nn.LSTM( | 
					
						
						|  | input_size=512, hidden_size=256, batch_first=True, bidirectional=True | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.classifier = nn.Linear( | 
					
						
						|  | in_features=512, out_features=self.num_class | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.detector = nn.Linear( | 
					
						
						|  | in_features=512, out_features=2 | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.apply(self.init_weights) | 
					
						
						|  |  | 
					
						
						|  | def get_feature_GAN(self, x): | 
					
						
						|  | seq_len = x.shape[-2] | 
					
						
						|  | x = x.float().transpose(-1, -2) | 
					
						
						|  |  | 
					
						
						|  | convblock_out = self.conv_block(x) | 
					
						
						|  |  | 
					
						
						|  | resblock1_out = self.res_block1(convblock_out) | 
					
						
						|  | resblock2_out = self.res_block2(resblock1_out) | 
					
						
						|  | resblock3_out = self.res_block3(resblock2_out) | 
					
						
						|  | poolblock_out = self.pool_block[0](resblock3_out) | 
					
						
						|  | poolblock_out = self.pool_block[1](poolblock_out) | 
					
						
						|  |  | 
					
						
						|  | return poolblock_out.transpose(-1, -2) | 
					
						
						|  |  | 
					
						
						|  | def get_feature(self, x): | 
					
						
						|  | seq_len = x.shape[-2] | 
					
						
						|  | x = x.float().transpose(-1, -2) | 
					
						
						|  |  | 
					
						
						|  | convblock_out = self.conv_block(x) | 
					
						
						|  |  | 
					
						
						|  | resblock1_out = self.res_block1(convblock_out) | 
					
						
						|  | resblock2_out = self.res_block2(resblock1_out) | 
					
						
						|  | resblock3_out = self.res_block3(resblock2_out) | 
					
						
						|  | poolblock_out = self.pool_block[0](resblock3_out) | 
					
						
						|  | poolblock_out = self.pool_block[1](poolblock_out) | 
					
						
						|  |  | 
					
						
						|  | return self.pool_block[2](poolblock_out) | 
					
						
						|  |  | 
					
						
						|  | def forward(self, x): | 
					
						
						|  | """ | 
					
						
						|  | Returns: | 
					
						
						|  | classification_prediction, detection_prediction | 
					
						
						|  | sizes: (b, 31, 722), (b, 31, 2) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | seq_len = x.shape[-1] | 
					
						
						|  | x = x.float().transpose(-1, -2) | 
					
						
						|  |  | 
					
						
						|  | convblock_out = self.conv_block(x) | 
					
						
						|  |  | 
					
						
						|  | resblock1_out = self.res_block1(convblock_out) | 
					
						
						|  | resblock2_out = self.res_block2(resblock1_out) | 
					
						
						|  | resblock3_out = self.res_block3(resblock2_out) | 
					
						
						|  |  | 
					
						
						|  | poolblock_out = self.pool_block[0](resblock3_out) | 
					
						
						|  | poolblock_out = self.pool_block[1](poolblock_out) | 
					
						
						|  | GAN_feature = poolblock_out.transpose(-1, -2) | 
					
						
						|  | poolblock_out = self.pool_block[2](poolblock_out) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | classifier_out = ( | 
					
						
						|  | poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512)) | 
					
						
						|  | ) | 
					
						
						|  | classifier_out, _ = self.bilstm_classifier( | 
					
						
						|  | classifier_out | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | classifier_out = classifier_out.contiguous().view((-1, 512)) | 
					
						
						|  | classifier_out = self.classifier(classifier_out) | 
					
						
						|  | classifier_out = classifier_out.view( | 
					
						
						|  | (-1, seq_len, self.num_class) | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out | 
					
						
						|  |  | 
					
						
						|  | @staticmethod | 
					
						
						|  | def init_weights(m): | 
					
						
						|  | if isinstance(m, nn.Linear): | 
					
						
						|  | nn.init.kaiming_uniform_(m.weight) | 
					
						
						|  | if m.bias is not None: | 
					
						
						|  | nn.init.constant_(m.bias, 0) | 
					
						
						|  | elif isinstance(m, nn.Conv2d): | 
					
						
						|  | nn.init.xavier_normal_(m.weight) | 
					
						
						|  | elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell): | 
					
						
						|  | for p in m.parameters(): | 
					
						
						|  | if p.data is None: | 
					
						
						|  | continue | 
					
						
						|  |  | 
					
						
						|  | if len(p.shape) >= 2: | 
					
						
						|  | nn.init.orthogonal_(p.data) | 
					
						
						|  | else: | 
					
						
						|  | nn.init.normal_(p.data) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class ResBlock(nn.Module): | 
					
						
						|  | def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01): | 
					
						
						|  | super().__init__() | 
					
						
						|  | self.downsample = in_channels != out_channels | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.pre_conv = nn.Sequential( | 
					
						
						|  | nn.BatchNorm2d(num_features=in_channels), | 
					
						
						|  | nn.LeakyReLU(leaky_relu_slope, inplace=True), | 
					
						
						|  | nn.MaxPool2d(kernel_size=(1, 2)), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.conv = nn.Sequential( | 
					
						
						|  | nn.Conv2d( | 
					
						
						|  | in_channels=in_channels, | 
					
						
						|  | out_channels=out_channels, | 
					
						
						|  | kernel_size=3, | 
					
						
						|  | padding=1, | 
					
						
						|  | bias=False, | 
					
						
						|  | ), | 
					
						
						|  | nn.BatchNorm2d(out_channels), | 
					
						
						|  | nn.LeakyReLU(leaky_relu_slope, inplace=True), | 
					
						
						|  | nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | self.conv1by1 = None | 
					
						
						|  | if self.downsample: | 
					
						
						|  | self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False) | 
					
						
						|  |  | 
					
						
						|  | def forward(self, x): | 
					
						
						|  | x = self.pre_conv(x) | 
					
						
						|  | if self.downsample: | 
					
						
						|  | x = self.conv(x) + self.conv1by1(x) | 
					
						
						|  | else: | 
					
						
						|  | x = self.conv(x) + x | 
					
						
						|  | return x | 
					
						
						|  |  |