# utilities.py — update by asigalov61 (commit 208cf4e, verified)
import os
import numpy as np
import audioread
import librosa
from mido import MidiFile
from piano_vad import (note_detection_with_onset_offset_regress,
pedal_detection_with_onset_offset_regress)
import config
def create_folder(fd):
    """Create directory `fd` (including parents) if it does not exist.

    Args:
      fd: str, directory path
    """
    # exist_ok avoids the check-then-create race when several processes
    # call this concurrently (the original exists()/makedirs() pair could
    # raise FileExistsError under contention).
    os.makedirs(fd, exist_ok=True)
def get_filename(path):
    """Return the base file name of `path` without directory or extension.

    Args:
      path: str, e.g. '/data/audio/song.wav'

    Returns:
      str, e.g. 'song'
    """
    path = os.path.realpath(path)
    # os.path.basename is portable across path separators, unlike the
    # original split('/') which fails on Windows paths.
    na_ext = os.path.basename(path)
    na = os.path.splitext(na_ext)[0]
    return na
def note_to_freq(piano_note):
    """Convert a piano-key index to its frequency in Hz.

    Equal temperament anchored so that piano_note == 39 maps to 440 Hz
    (A4), i.e. each step of 12 doubles the frequency.

    Args:
      piano_note: int or float, piano key index

    Returns:
      float, frequency in Hz
    """
    semitones_from_a4 = piano_note - 39
    return 440 * 2 ** (semitones_from_a4 / 12)
def float32_to_int16(x):
    """Scale float audio samples in [-1, 1] to the int16 PCM range."""
    # Reject out-of-range input rather than silently wrapping on cast.
    assert np.max(np.abs(x)) <= 1.
    scaled = x * 32767.
    return scaled.astype(np.int16)
def int16_to_float32(x):
    """Scale int16 PCM samples to float32 audio in [-1, 1]."""
    normalized = x / 32767.
    return normalized.astype(np.float32)
def pad_truncate_sequence(x, max_len):
    """Zero-pad or truncate a 1-D sequence to exactly `max_len` elements.

    Args:
      x: 1-D np.ndarray (or array-like)
      max_len: int, target length

    Returns:
      np.ndarray of length max_len; when padding, the pad keeps x's dtype.
    """
    if len(x) < max_len:
        # Pad with zeros of the same dtype: np.zeros defaults to float64,
        # which would silently upcast int/float32 inputs on concatenation.
        pad = np.zeros(max_len - len(x), dtype=np.asarray(x).dtype)
        return np.concatenate((x, pad))
    else:
        return x[0 : max_len]
def read_midi(midi_path):
    """Parse a two-track MIDI file into event strings and absolute times.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
          'program_change channel=0 program=0 time=0',
          'control_change channel=0 control=64 value=127 time=0',
          'control_change channel=0 control=64 value=63 time=236',
          ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """
    midi_file = MidiFile(midi_path)
    # Only the two-track layout is supported: track 0 carries tempo and
    # time signature meta messages, track 1 carries the piano events.
    assert len(midi_file.tracks) == 2
    # First meta message of track 0 is assumed to hold the tempo.
    microseconds_per_beat = midi_file.tracks[0][0].tempo
    ticks_per_second = midi_file.ticks_per_beat * (1e6 / microseconds_per_beat)
    events = []
    event_times = []
    accumulated_ticks = 0
    for message in midi_file.tracks[1]:
        # message.time is a delta in ticks; accumulate to absolute time.
        accumulated_ticks += message.time
        events.append(str(message))
        event_times.append(accumulated_ticks / ticks_per_second)
    return {
        'midi_event': np.array(events),
        'midi_event_time': np.array(event_times)}
def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write note events (and optional pedal events) to a MIDI file.

    Args:
      start_time: float, seconds subtracted from every event time; events
        that would land before tick 0 are dropped.
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50}
        ...]
      pedal_events: list of dict with 'onset_time'/'offset_time', or None
      midi_path: str, output path
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration is the same as MIDIs in MAESTRO dataset.
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0: tempo and time-signature metadata only.
    track0 = MidiTrack()
    track0.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    track0.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    track0.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track0)

    # Build a flat roll of timed messages before converting to delta ticks.
    message_roll = []
    for note_event in note_events:
        # Onset message.
        message_roll.append({
            'time': note_event['onset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': note_event['velocity']})
        # Offset is encoded as a note_on with velocity 0.
        message_roll.append({
            'time': note_event['offset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': 0})
    if pedal_events:
        for pedal_event in pedal_events:
            message_roll.append({'time': pedal_event['onset_time'], 'control_change': 64, 'value': 127})
            message_roll.append({'time': pedal_event['offset_time'], 'control_change': 64, 'value': 0})
    # Stable sort by time keeps note messages before pedal messages at
    # identical timestamps (same order they were appended).
    message_roll.sort(key=lambda message: message['time'])

    # Track 1: the piano events, expressed as delta ticks.
    track1 = MidiTrack()
    previous_ticks = 0
    for message in message_roll:
        this_ticks = int((message['time'] - start_time) * ticks_per_second)
        if this_ticks >= 0:  # drop anything before start_time
            diff_ticks = this_ticks - previous_ticks
            previous_ticks = this_ticks
            if 'midi_note' in message.keys():
                track1.append(Message('note_on', note=message['midi_note'], velocity=message['velocity'], time=diff_ticks))
            elif 'control_change' in message.keys():
                track1.append(Message('control_change', channel=0, control=message['control_change'], value=message['value'], time=diff_ticks))
    track1.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track1)

    midi_file.save(midi_path)
class RegressionPostProcessor(object):
    """Post-process the regression outputs of a piano-transcription model
    into note events and (optionally) sustain-pedal events.

    The model emits per-frame, per-class regression values for onsets and
    offsets plus frame-wise presence probabilities; this class binarizes
    the regressions, runs onset/offset event detection, and formats the
    results as MIDI-style event dicts.
    """

    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          frames_per_second: int
          classes_num: int, number of piano-note classes (e.g. 88)
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        # Class index + begin_note gives the MIDI note number.
        self.begin_note = config.begin_note
        # Multiplier mapping normalized [0, 1] velocities to MIDI velocity.
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Returns:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]
          est_pedal_events: list of dict (or None), e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events

    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]
          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time. E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """
        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this is
            more accurate to detect pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrices results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict.keys():
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)
        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs

    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        A frame is marked as an event when its regression value exceeds
        `threshold` AND is a local maximum over `neighbour` frames on each
        side; the sub-frame shift is then estimated by interpolation.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int, half-width of the local-maximum window

        Returns:
          binary_output: (frames_num, classes_num), values are 0 or 1
          shift_output: (frames_num, classes_num), fractional frame shifts
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            # Skip the first/last `neighbour` frames so the window fits.
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onsets and Offsets Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output

    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect if values are monotonic in both side of x[n].

        Args:
          x: (frames_num,)
          n: int, center index
          neighbour: int, number of frames checked on each side

        Returns:
          monotonic: bool, True when x is non-increasing moving away from
            x[n] on both sides
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic

    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
            MIDI notes and velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            # Map class index to an absolute MIDI note number.
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)   # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)   # (notes,)

        if len(est_tuples) == 0:
            return np.array([])
        else:
            # Frame index + sub-frame shift, converted to seconds.
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
            """(notes, 4), the four columns are onset_times, offset_times,
            MIDI note and velocity."""
            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels

    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num,),
            'pedal_offset_output': (frames_num,),
            'pedal_offset_shift_output': (frames_num,),
            ...}

        Returns:
          est_on_off: (notes, 2), the two columns are pedal onsets and pedal
            offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        # NOTE(review): frames_num is computed but not used below.
        frames_num = output_dict['pedal_frame_output'].shape[0]

        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        est_tuples = np.array(est_tuples)
        """(notes, 2), the two columns are pedal onsets and pedal offsets"""

        if len(est_tuples) == 0:
            return np.array([])
        else:
            # Frame index + sub-frame shift, converted to seconds.
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off

    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to midi events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
            offset_times, MIDI notes and normalized velocities. E.g.
            [[32.8376, 35.7700, 27., 0.7932],
             [37.3712, 39.9300, 33., 0.8058],
             ...]

        Returns:
          midi_events, list, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                # Scale normalized velocity back to an integer MIDI velocity.
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events

    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onset and offsets to events.

        Args:
          pedal_on_offs: (notes, 2), the two columns are pedal onsets and pedal
            offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events
def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
        dtype=np.float32, res_type='kaiser_best',
        backends=None):
    """Load audio. Copied from librosa.core.load() except that ffmpeg backend is
    always used in this function.

    Args:
      path: str, path to the audio file
      sr: int or None, target sample rate; None keeps the native rate
      mono: bool, downmix multi-channel audio to mono
      offset: float, start reading after this many seconds
      duration: float or None, read at most this many seconds
      dtype: numpy dtype of the returned samples
      res_type: str, resampling mode passed to librosa.resample
      backends: list of audioread backend classes, or None for the default
        ffmpeg decoder

    Returns:
      (y, sr): np.ndarray of samples and the sample rate actually used
    """
    if backends is None:
        # Default resolved at call time to avoid a mutable default argument.
        backends = [audioread.ffdec.FFmpegAudioFile]

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Sample offsets are counted in interleaved samples (frames * channels).
        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration))
                * n_channels)

        n = 0

        for frame in input_file:
            # Decode 16-bit PCM buffer to float samples.
            frame = librosa.util.buf_to_float(frame, n_bytes=2, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            # De-interleave to (channels, frames).
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)

        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)