Spaces: Running on Zero

asigalov61 committed commit dcca7d2 (parent: 6927483): Upload 6 files

Files changed:
- config.py +7 -0
- inference.py +171 -0
- models.py +353 -0
- piano_vad.py +130 -0
- pytorch_utils.py +66 -0
- utilities.py +564 -0

config.py
ADDED
@@ -0,0 +1,7 @@
sample_rate = 16000
classes_num = 88    # Number of notes of piano
begin_note = 21     # MIDI note of A0, the lowest note of a piano.
segment_seconds = 10.   # Training segment duration
hop_seconds = 1.
frames_per_second = 100
velocity_scale = 128
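
For orientation, a small sketch (not part of the commit) of how these constants combine downstream, assuming config.py is importable as a plain module on the path:

    import config  # this file, imported as a plain module

    # 10 s of 16 kHz audio per training segment, 100 model frames per second
    segment_samples = int(config.sample_rate * config.segment_seconds)       # 160000 samples
    segment_frames = int(config.frames_per_second * config.segment_seconds)  # 1000 frames
    hop_samples = int(config.sample_rate * config.hop_seconds)               # 16000 samples between segments
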
inference.py
ADDED
@@ -0,0 +1,171 @@
import os
import numpy as np
import time
import librosa
from pathlib import Path

import torch

from .utilities import (create_folder, get_filename, RegressionPostProcessor,
    write_events_to_midi)
from .models import Regress_onset_offset_frame_velocity_CRNN, Note_pedal
from .pytorch_utils import move_data_to_device, forward
from . import config


class PianoTranscription(object):
    def __init__(self, model_type='Note_pedal', checkpoint_path=None,
        segment_samples=16000*10, device=torch.device('cuda')):
        """Class for transcribing piano solo recording.

        Args:
          model_type: str
          checkpoint_path: str
          segment_samples: int
          device: 'cuda' | 'cpu'
        """
        if not checkpoint_path:
            checkpoint_path = '{}/piano_transcription_inference_data/note_F1=0.9677_pedal_F1=0.9186.pth'.format(str(Path.home()))
        print('Checkpoint path: {}'.format(checkpoint_path))

        if not os.path.exists(checkpoint_path) or os.path.getsize(checkpoint_path) < 1.6e8:
            create_folder(os.path.dirname(checkpoint_path))
            print('Total size: ~165 MB')
            zenodo_path = 'https://zenodo.org/record/4034264/files/CRNN_note_F1%3D0.9677_pedal_F1%3D0.9186.pth?download=1'
            os.system('wget -O "{}" "{}"'.format(checkpoint_path, zenodo_path))

        print('Using {} for inference.'.format(device))

        self.segment_samples = segment_samples
        self.frames_per_second = config.frames_per_second
        self.classes_num = config.classes_num
        self.onset_threshold = 0.3
        self.offset_threshold = 0.3
        self.frame_threshold = 0.1
        self.pedal_offset_threshold = 0.2

        # Build model
        Model = eval(model_type)
        self.model = Model(frames_per_second=self.frames_per_second,
            classes_num=self.classes_num)

        # Load model
        checkpoint = torch.load(checkpoint_path, map_location=device)
        self.model.load_state_dict(checkpoint['model'], strict=False)

        # Parallel
        if 'cuda' in str(device):
            self.model.to(device)
            print('GPU number: {}'.format(torch.cuda.device_count()))
            self.model = torch.nn.DataParallel(self.model)
        else:
            print('Using CPU.')

    def transcribe(self, audio, midi_path):
        """Transcribe an audio recording.

        Args:
          audio: (audio_samples,)
          midi_path: str, path to write out the transcribed MIDI.

        Returns:
          transcribed_dict, dict: {'output_dict': ..., 'est_note_events': ...}
        """
        audio = audio[None, :]  # (1, audio_samples)

        # Pad audio to be evenly divided by segment_samples
        audio_len = audio.shape[1]
        pad_len = int(np.ceil(audio_len / self.segment_samples))\
            * self.segment_samples - audio_len

        audio = np.concatenate((audio, np.zeros((1, pad_len))), axis=1)

        # Enframe to segments
        segments = self.enframe(audio, self.segment_samples)
        """(N, segment_samples)"""

        # Forward
        output_dict = forward(self.model, segments, batch_size=1)
        """{'reg_onset_output': (N, segment_frames, classes_num), ...}"""

        # Deframe to original length
        for key in output_dict.keys():
            output_dict[key] = self.deframe(output_dict[key])[0 : audio_len]
        """output_dict: {
          'reg_onset_output': (N, segment_frames, classes_num),
          'reg_offset_output': (N, segment_frames, classes_num),
          'frame_output': (N, segment_frames, classes_num),
          'velocity_output': (N, segment_frames, classes_num)}"""

        # Post processor
        post_processor = RegressionPostProcessor(self.frames_per_second,
            classes_num=self.classes_num, onset_threshold=self.onset_threshold,
            offset_threshold=self.offset_threshold,
            frame_threshold=self.frame_threshold,
            pedal_offset_threshold=self.pedal_offset_threshold)

        # Post process output_dict to MIDI events
        (est_note_events, est_pedal_events) = \
            post_processor.output_dict_to_midi_events(output_dict)

        # Write MIDI events to file
        if midi_path:
            write_events_to_midi(start_time=0, note_events=est_note_events,
                pedal_events=est_pedal_events, midi_path=midi_path)
            print('Write out to {}'.format(midi_path))

        transcribed_dict = {
            'output_dict': output_dict,
            'est_note_events': est_note_events,
            'est_pedal_events': est_pedal_events}

        return transcribed_dict

    def enframe(self, x, segment_samples):
        """Enframe long sequence to short segments.

        Args:
          x: (1, audio_samples)
          segment_samples: int

        Returns:
          batch: (N, segment_samples)
        """
        assert x.shape[1] % segment_samples == 0
        batch = []

        pointer = 0
        while pointer + segment_samples <= x.shape[1]:
            batch.append(x[:, pointer : pointer + segment_samples])
            pointer += segment_samples // 2

        batch = np.concatenate(batch, axis=0)
        return batch

    def deframe(self, x):
        """Deframe predicted segments to original sequence.

        Args:
          x: (N, segment_frames, classes_num)

        Returns:
          y: (audio_frames, classes_num)
        """
        if x.shape[0] == 1:
            return x[0]

        else:
            x = x[:, 0 : -1, :]
            """Remove an extra frame in the end of each segment caused by the
            'center=True' argument when calculating spectrogram."""
            (N, segment_samples, classes_num) = x.shape
            assert segment_samples % 4 == 0

            y = []
            y.append(x[0, 0 : int(segment_samples * 0.75)])
            for i in range(1, N - 1):
                y.append(x[i, int(segment_samples * 0.25) : int(segment_samples * 0.75)])
            y.append(x[-1, int(segment_samples * 0.25) :])
            y = np.concatenate(y, axis=0)
            return y
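
For context, a minimal usage sketch of the class above. This is an assumption, not part of the commit: it presumes the six files are importable as a package (the relative imports suggest so; the published package of this code is named piano_transcription_inference) and that audio is loaded at config.sample_rate.

    import librosa
    import torch
    from piano_transcription_inference import PianoTranscription, config  # hypothetical import path

    # Load a recording as a mono float array at the model's sample rate
    (audio, _) = librosa.load('example.wav', sr=config.sample_rate, mono=True)

    # The first run downloads the ~165 MB Zenodo checkpoint to ~/piano_transcription_inference_data/
    transcriptor = PianoTranscription(device=torch.device('cpu'))
    transcribed_dict = transcriptor.transcribe(audio, 'example.mid')
    print(len(transcribed_dict['est_note_events']), 'note events written')
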
models.py
ADDED
@@ -0,0 +1,353 @@
import os
import sys
import math
import time
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from .pytorch_utils import move_data_to_device


def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)


def init_gru(rnn):
    """Initialize a GRU layer. """

    def _concat_init(tensor, init_funcs):
        (length, fan_out) = tensor.shape
        fan_in = length // len(init_funcs)

        for (i, init_func) in enumerate(init_funcs):
            init_func(tensor[i * fan_in : (i + 1) * fan_in, :])

    def _inner_uniform(tensor):
        fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in')
        nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in))

    for i in range(rnn.num_layers):
        _concat_init(
            getattr(rnn, 'weight_ih_l{}'.format(i)),
            [_inner_uniform, _inner_uniform, _inner_uniform]
        )
        torch.nn.init.constant_(getattr(rnn, 'bias_ih_l{}'.format(i)), 0)

        _concat_init(
            getattr(rnn, 'weight_hh_l{}'.format(i)),
            [_inner_uniform, _inner_uniform, nn.init.orthogonal_]
        )
        torch.nn.init.constant_(getattr(rnn, 'bias_hh_l{}'.format(i)), 0)


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, momentum):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                              out_channels=out_channels,
                              kernel_size=(3, 3), stride=(1, 1),
                              padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
                              out_channels=out_channels,
                              kernel_size=(3, 3), stride=(1, 1),
                              padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels, momentum)
        self.bn2 = nn.BatchNorm2d(out_channels, momentum)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """
        Args:
          input: (batch_size, in_channels, time_steps, freq_bins)
        Outputs:
          output: (batch_size, out_channels, classes_num)
        """

        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))

        if pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)

        return x


class AcousticModelCRnn8Dropout(nn.Module):
    def __init__(self, classes_num, midfeat, momentum):
        super(AcousticModelCRnn8Dropout, self).__init__()

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=48, momentum=momentum)
        self.conv_block2 = ConvBlock(in_channels=48, out_channels=64, momentum=momentum)
        self.conv_block3 = ConvBlock(in_channels=64, out_channels=96, momentum=momentum)
        self.conv_block4 = ConvBlock(in_channels=96, out_channels=128, momentum=momentum)

        self.fc5 = nn.Linear(midfeat, 768, bias=False)
        self.bn5 = nn.BatchNorm1d(768, momentum=momentum)

        self.gru = nn.GRU(input_size=768, hidden_size=256, num_layers=2,
            bias=True, batch_first=True, dropout=0., bidirectional=True)

        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        init_layer(self.fc5)
        init_bn(self.bn5)
        init_gru(self.gru)
        init_layer(self.fc)

    def forward(self, input):
        """
        Args:
          input: (batch_size, channels_num, time_steps, freq_bins)
        Outputs:
          output: (batch_size, time_steps, classes_num)
        """

        x = self.conv_block1(input, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = x.transpose(1, 2).flatten(2)
        x = F.relu(self.bn5(self.fc5(x).transpose(1, 2)).transpose(1, 2))
        x = F.dropout(x, p=0.5, training=self.training, inplace=True)

        (x, _) = self.gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)
        output = torch.sigmoid(self.fc(x))
        return output


class Regress_onset_offset_frame_velocity_CRNN(nn.Module):
    def __init__(self, frames_per_second, classes_num):
        super(Regress_onset_offset_frame_velocity_CRNN, self).__init__()

        sample_rate = 16000
        window_size = 2048
        hop_size = sample_rate // frames_per_second
        mel_bins = 229
        fmin = 30
        fmax = sample_rate // 2

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        midfeat = 1792
        momentum = 0.01

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
            hop_length=hop_size, win_length=window_size, window=window,
            center=center, pad_mode=pad_mode, freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
            n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref,
            amin=amin, top_db=top_db, freeze_parameters=True)

        self.bn0 = nn.BatchNorm2d(mel_bins, momentum)

        self.frame_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_onset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.reg_offset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
        self.velocity_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)

        self.reg_onset_gru = nn.GRU(input_size=88 * 2, hidden_size=256, num_layers=1,
            bias=True, batch_first=True, dropout=0., bidirectional=True)
        self.reg_onset_fc = nn.Linear(512, classes_num, bias=True)

        self.frame_gru = nn.GRU(input_size=88 * 3, hidden_size=256, num_layers=1,
            bias=True, batch_first=True, dropout=0., bidirectional=True)
        self.frame_fc = nn.Linear(512, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)
        init_gru(self.reg_onset_gru)
        init_gru(self.frame_gru)
        init_layer(self.reg_onset_fc)
        init_layer(self.frame_fc)

    def forward(self, input):
        """
        Args:
          input: (batch_size, data_length)
        Outputs:
          output_dict: dict, {
            'reg_onset_output': (batch_size, time_steps, classes_num),
            'reg_offset_output': (batch_size, time_steps, classes_num),
            'frame_output': (batch_size, time_steps, classes_num),
            'velocity_output': (batch_size, time_steps, classes_num)
          }
        """

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        frame_output = self.frame_model(x)  # (batch_size, time_steps, classes_num)
        reg_onset_output = self.reg_onset_model(x)  # (batch_size, time_steps, classes_num)
        reg_offset_output = self.reg_offset_model(x)    # (batch_size, time_steps, classes_num)
        velocity_output = self.velocity_model(x)    # (batch_size, time_steps, classes_num)

        # Use velocities to condition onset regression
        x = torch.cat((reg_onset_output, (reg_onset_output ** 0.5) * velocity_output.detach()), dim=2)
        (x, _) = self.reg_onset_gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)
        reg_onset_output = torch.sigmoid(self.reg_onset_fc(x))
        """(batch_size, time_steps, classes_num)"""

        # Use onsets and offsets to condition frame-wise classification
        x = torch.cat((frame_output, reg_onset_output.detach(), reg_offset_output.detach()), dim=2)
        (x, _) = self.frame_gru(x)
        x = F.dropout(x, p=0.5, training=self.training, inplace=False)
        frame_output = torch.sigmoid(self.frame_fc(x))  # (batch_size, time_steps, classes_num)
        """(batch_size, time_steps, classes_num)"""

        output_dict = {
            'reg_onset_output': reg_onset_output,
            'reg_offset_output': reg_offset_output,
            'frame_output': frame_output,
            'velocity_output': velocity_output}

        return output_dict


class Regress_pedal_CRNN(nn.Module):
    def __init__(self, frames_per_second, classes_num):
        super(Regress_pedal_CRNN, self).__init__()

        sample_rate = 16000
        window_size = 2048
        hop_size = sample_rate // frames_per_second
        mel_bins = 229
        fmin = 30
        fmax = sample_rate // 2

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        midfeat = 1792
        momentum = 0.01

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
            hop_length=hop_size, win_length=window_size, window=window,
            center=center, pad_mode=pad_mode, freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
            n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref,
            amin=amin, top_db=top_db, freeze_parameters=True)

        self.bn0 = nn.BatchNorm2d(mel_bins, momentum)

        self.reg_pedal_onset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
        self.reg_pedal_offset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
        self.reg_pedal_frame_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)

        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)

    def forward(self, input):
        """
        Args:
          input: (batch_size, data_length)
        Outputs:
          output_dict: dict, {
            'reg_pedal_onset_output': (batch_size, time_steps, 1),
            'reg_pedal_offset_output': (batch_size, time_steps, 1),
            'pedal_frame_output': (batch_size, time_steps, 1)
          }
        """

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        reg_pedal_onset_output = self.reg_pedal_onset_model(x)  # (batch_size, time_steps, 1)
        reg_pedal_offset_output = self.reg_pedal_offset_model(x)  # (batch_size, time_steps, 1)
        pedal_frame_output = self.reg_pedal_frame_model(x)  # (batch_size, time_steps, 1)

        output_dict = {
            'reg_pedal_onset_output': reg_pedal_onset_output,
            'reg_pedal_offset_output': reg_pedal_offset_output,
            'pedal_frame_output': pedal_frame_output}

        return output_dict


# This model is not trained, but is combined from the trained note and pedal models.
class Note_pedal(nn.Module):
    def __init__(self, frames_per_second, classes_num):
        """The combination of note and pedal model.
        """
        super(Note_pedal, self).__init__()

        self.note_model = Regress_onset_offset_frame_velocity_CRNN(frames_per_second, classes_num)
        self.pedal_model = Regress_pedal_CRNN(frames_per_second, classes_num)

    def load_state_dict(self, m, strict=False):
        self.note_model.load_state_dict(m['note_model'], strict=strict)
        self.pedal_model.load_state_dict(m['pedal_model'], strict=strict)

    def forward(self, input):
        note_output_dict = self.note_model(input)
        pedal_output_dict = self.pedal_model(input)

        full_output_dict = {}
        full_output_dict.update(note_output_dict)
        full_output_dict.update(pedal_output_dict)
        return full_output_dict
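
As a shape check, a sketch under stated assumptions (not part of the commit): with frames_per_second=100 and a 10-second segment at 16 kHz, the spectrogram's center=True padding yields 1001 frames, so the note heads should return (batch, 1001, 88) and the pedal heads (batch, 1001, 1).

    import torch

    model = Note_pedal(frames_per_second=100, classes_num=88)
    model.eval()
    with torch.no_grad():
        output_dict = model(torch.zeros(1, 160000))  # (batch_size, data_length)
    for key, value in output_dict.items():
        print(key, tuple(value.shape))
    # expected, e.g.: reg_onset_output (1, 1001, 88) ... pedal_frame_output (1, 1001, 1)
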
piano_vad.py
ADDED
@@ -0,0 +1,130 @@
import numpy as np


def note_detection_with_onset_offset_regress(frame_output, onset_output,
    onset_shift_output, offset_output, offset_shift_output, velocity_output,
    frame_threshold):
    """Process prediction matrices to note events information.
    First, detect onsets with onset outputs. Then, detect offsets
    with frame and offset outputs.

    Args:
      frame_output: (frames_num,)
      onset_output: (frames_num,)
      onset_shift_output: (frames_num,)
      offset_output: (frames_num,)
      offset_shift_output: (frames_num,)
      velocity_output: (frames_num,)
      frame_threshold: float
    Returns:
      output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
      e.g., [
        [1821, 1909, 0.47498, 0.3048533, 0.72119445],
        [1909, 1947, 0.30730522, -0.45764327, 0.64200014],
        ...]
    """
    output_tuples = []
    bgn = None
    frame_disappear = None
    offset_occur = None

    for i in range(onset_output.shape[0]):
        if onset_output[i] == 1:
            """Onset detected"""
            if bgn:
                """Consecutive onsets. E.g., pedal is not released, but two
                consecutive notes being played."""
                fin = max(i - 1, 0)
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    0, velocity_output[bgn]])
                frame_disappear, offset_occur = None, None
            bgn = i

        if bgn and i > bgn:
            """If onset found, then search offset"""
            if frame_output[i] <= frame_threshold and not frame_disappear:
                """Frame disappear detected"""
                frame_disappear = i

            if offset_output[i] == 1 and not offset_occur:
                """Offset detected"""
                offset_occur = i

            if frame_disappear:
                if offset_occur and offset_occur - bgn > frame_disappear - offset_occur:
                    """bgn --------- offset_occur --- frame_disappear"""
                    fin = offset_occur
                else:
                    """bgn --- offset_occur --------- frame_disappear"""
                    fin = frame_disappear
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

            if bgn and (i - bgn >= 600 or i == onset_output.shape[0] - 1):
                """Offset not detected"""
                fin = i
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                    offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

    # Sort pairs by onsets
    output_tuples.sort(key=lambda pair: pair[0])

    return output_tuples


def pedal_detection_with_onset_offset_regress(frame_output, offset_output,
    offset_shift_output, frame_threshold):
    """Process prediction array to pedal events information.

    Args:
      frame_output: (frames_num,)
      offset_output: (frames_num,)
      offset_shift_output: (frames_num,)
      frame_threshold: float
    Returns:
      output_tuples: list of [bgn, fin, onset_shift, offset_shift],
      e.g., [
        [1821, 1909, 0.4749851, 0.3048533],
        [1909, 1947, 0.30730522, -0.45764327],
        ...]
    """
    output_tuples = []
    bgn = None
    frame_disappear = None
    offset_occur = None

    for i in range(1, frame_output.shape[0]):
        if frame_output[i] >= frame_threshold and frame_output[i] > frame_output[i - 1]:
            """Pedal onset detected"""
            if bgn:
                pass
            else:
                bgn = i

        if bgn and i > bgn:
            """If onset found, then search offset"""
            if frame_output[i] <= frame_threshold and not frame_disappear:
                """Frame disappear detected"""
                frame_disappear = i

            if offset_output[i] == 1 and not offset_occur:
                """Offset detected"""
                offset_occur = i

            if offset_occur:
                fin = offset_occur
                output_tuples.append([bgn, fin, 0., offset_shift_output[fin]])
                bgn, frame_disappear, offset_occur = None, None, None

            if frame_disappear and i - frame_disappear >= 10:
                """offset not detected but frame disappear"""
                fin = frame_disappear
                output_tuples.append([bgn, fin, 0., offset_shift_output[fin]])
                bgn, frame_disappear, offset_occur = None, None, None

    # Sort pairs by onsets
    output_tuples.sort(key=lambda pair: pair[0])

    return output_tuples
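
To make the bookkeeping above concrete, a toy run with invented arrays (not model output): one note whose onset is flagged at frame 2 and whose frame activity drops below the threshold at frame 6.

    import numpy as np

    frames = 10
    onset = np.zeros(frames);  onset[2] = 1        # binarized onset, as produced by the post processor
    offset = np.zeros(frames); offset[6] = 1       # binarized offset
    frame = np.array([0, 0, .9, .9, .8, .7, .05, 0, 0, 0])
    shift = np.zeros(frames)                       # onset/offset shifts, zero for simplicity
    velocity = np.full(frames, 0.6)

    tuples = note_detection_with_onset_offset_regress(
        frame_output=frame, onset_output=onset, onset_shift_output=shift,
        offset_output=offset, offset_shift_output=shift, velocity_output=velocity,
        frame_threshold=0.1)
    print(tuples)  # roughly [[2, 6, 0.0, 0.0, 0.6]]
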
pytorch_utils.py
ADDED
@@ -0,0 +1,66 @@
import os
import numpy as np
import time
import torch

from .utilities import pad_truncate_sequence


def move_data_to_device(x, device):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        return x

    return x.to(device)


def append_to_dict(dict, key, value):
    if key in dict.keys():
        dict[key].append(value)
    else:
        dict[key] = [value]


def forward(model, x, batch_size):
    """Forward data to model in mini-batch.

    Args:
      model: object
      x: (N, segment_samples)
      batch_size: int

    Returns:
      output_dict: dict, e.g. {
        'frame_output': (segments_num, frames_num, classes_num),
        'onset_output': (segments_num, frames_num, classes_num),
        ...}
    """

    output_dict = {}
    device = next(model.parameters()).device

    pointer = 0
    total_segments = int(np.ceil(len(x) / batch_size))

    while True:
        print('Segment {} / {}'.format(pointer, total_segments))
        if pointer >= len(x):
            break

        batch_waveform = move_data_to_device(x[pointer : pointer + batch_size], device)
        pointer += batch_size

        with torch.no_grad():
            model.eval()
            batch_output_dict = model(batch_waveform)

        for key in batch_output_dict.keys():
            append_to_dict(output_dict, key, batch_output_dict[key].data.cpu().numpy())

    for key in output_dict.keys():
        output_dict[key] = np.concatenate(output_dict[key], axis=0)

    return output_dict
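
forward() only assumes the model returns a dict of tensors, so a stub module is enough to sketch the batching flow (an illustrative assumption, not part of the commit):

    import numpy as np
    import torch
    import torch.nn as nn

    class _Stub(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)  # gives the module a parameter so its .device can be read
        def forward(self, wav):        # wav: (batch, segment_samples)
            return {'frame_output': torch.zeros(wav.shape[0], 100, 88)}

    segments = np.zeros((3, 16000), dtype=np.float32)   # (N, segment_samples)
    out = forward(_Stub(), segments, batch_size=1)      # forward() defined above
    print(out['frame_output'].shape)                    # (3, 100, 88)
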
utilities.py
ADDED
@@ -0,0 +1,564 @@
import os
import numpy as np
import audioread
import librosa
from mido import MidiFile

from .piano_vad import (note_detection_with_onset_offset_regress,
    pedal_detection_with_onset_offset_regress)
from . import config


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)


def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na


def note_to_freq(piano_note):
    return 2 ** ((piano_note - 39) / 12) * 440


def float32_to_int16(x):
    assert np.max(np.abs(x)) <= 1.
    return (x * 32767.).astype(np.int16)


def int16_to_float32(x):
    return (x / 32767.).astype(np.float32)


def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]


def read_midi(midi_path):
    """Parse MIDI file.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
            'program_change channel=0 program=0 time=0',
            'control_change channel=0 control=64 value=127 time=0',
            'control_change channel=0 control=64 value=63 time=236',
            ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """

    midi_file = MidiFile(midi_path)
    ticks_per_beat = midi_file.ticks_per_beat

    assert len(midi_file.tracks) == 2
    """The first track contains tempo, time signature. The second track
    contains piano events."""

    microseconds_per_beat = midi_file.tracks[0][0].tempo
    beats_per_second = 1e6 / microseconds_per_beat
    ticks_per_second = ticks_per_beat * beats_per_second

    message_list = []

    ticks = 0
    time_in_second = []

    for message in midi_file.tracks[1]:
        message_list.append(str(message))
        ticks += message.time
        time_in_second.append(ticks / ticks_per_second)

    midi_dict = {
        'midi_event': np.array(message_list),
        'midi_event_time': np.array(time_in_second)}

    return midi_dict


def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write out note events to MIDI file.

    Args:
      start_time: float
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50},
        ...]
      midi_path: str
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration is the same as MIDIs in MAESTRO dataset
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0
    track0 = MidiTrack()
    track0.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    track0.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    track0.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track0)

    # Track 1
    track1 = MidiTrack()

    # Message rolls of MIDI
    message_roll = []

    for note_event in note_events:
        # Onset
        message_roll.append({
            'time': note_event['onset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': note_event['velocity']})

        # Offset
        message_roll.append({
            'time': note_event['offset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': 0})

    if pedal_events:
        for pedal_event in pedal_events:
            message_roll.append({'time': pedal_event['onset_time'], 'control_change': 64, 'value': 127})
            message_roll.append({'time': pedal_event['offset_time'], 'control_change': 64, 'value': 0})

    # Sort MIDI messages by time
    message_roll.sort(key=lambda note_event: note_event['time'])

    previous_ticks = 0
    for message in message_roll:
        this_ticks = int((message['time'] - start_time) * ticks_per_second)
        if this_ticks >= 0:
            diff_ticks = this_ticks - previous_ticks
            previous_ticks = this_ticks
            if 'midi_note' in message.keys():
                track1.append(Message('note_on', note=message['midi_note'], velocity=message['velocity'], time=diff_ticks))
            elif 'control_change' in message.keys():
                track1.append(Message('control_change', channel=0, control=message['control_change'], value=message['value'], time=diff_ticks))
    track1.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track1)

    midi_file.save(midi_path)


class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          frames_per_second: int
          classes_num: int
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = config.begin_note
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Outputs:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

          est_pedal_events: list of dict, e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """

        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events

    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]

          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time. E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """

        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this is
            more accurate to detect pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrices results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict.keys():
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)

        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs

    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int

        Returns:
          binary_output: (frames_num, classes_num)
          shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onsets and Offsets Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output

    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect if values are monotonic in both side of x[n].

        Args:
          x: (frames_num,)
          n: int
          neighbour: int

        Returns:
          monotonic: bool
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic

    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
            MIDI notes and velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)   # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)   # (notes,)

        if len(est_tuples) == 0:
            return np.array([])

        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
            """(notes, 4), the four columns are onset_times, offset_times, MIDI notes and velocities."""

            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels

    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num,),
            'pedal_offset_output': (frames_num,),
            'pedal_offset_shift_output': (frames_num,),
            ...}

        Returns:
          est_on_off: (notes, 2), the two columns are pedal onsets and pedal
            offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        frames_num = output_dict['pedal_frame_output'].shape[0]

        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        est_tuples = np.array(est_tuples)
        """(notes, 2), the two columns are pedal onsets and pedal offsets"""

        if len(est_tuples) == 0:
            return np.array([])

        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off

    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to MIDI events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
            offset_times, MIDI notes and velocities. E.g.
            [[32.8376, 35.7700, 27., 0.7932],
             [37.3712, 39.9300, 33., 0.8058],
             ...]

        Returns:
          midi_events, list, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events

    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onset and offsets to events.

        Args:
          pedal_on_offs: (notes, 2), the two columns are pedal onsets and pedal
            offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events


def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that ffmpeg backend is
    always used in this function."""

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                               * n_channels)

        n = 0

        for frame in input_file:
            frame = librosa.util.buf_to_float(frame, n_bytes=2, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)

        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
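
As an illustration of the MIDI writer above, a toy call with invented events (not part of the commit): two notes plus one sustain-pedal press, written with write_events_to_midi() in the format produced by RegressionPostProcessor.

    note_events = [
        {'midi_note': 60, 'onset_time': 0.50, 'offset_time': 1.00, 'velocity': 80},
        {'midi_note': 64, 'onset_time': 1.00, 'offset_time': 1.60, 'velocity': 72}]
    pedal_events = [{'onset_time': 0.40, 'offset_time': 1.70}]

    write_events_to_midi(start_time=0, note_events=note_events,
        pedal_events=pedal_events, midi_path='toy.mid')  # writes control 64 messages for the pedal
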