pivaenist / audio.py
TomRB22's picture
Set seconds parameter of generate_and_display to -1
bd1f4bf
raw
history blame
5.11 kB
# Machine learning, flow and data
import tensorflow as tf
import numpy as np
import pandas as pd
# Audio
import pretty_midi
# Displaying
from IPython import display
# Get the absolute path of the directory and add it to sys.path in order to
# get the VAE class type
import sys
from pathlib import Path
directory = Path(__file__).resolve().parent
sys.path.insert(0, str(directory))
from model import VAE
# Extras
import collections
# Maximum number of notes kept per song; longer songs are truncated to this
_CAP = 3501
# Samples per second used both for synthesis and for audio playback
_SAMPLING_RATE = 16000
# Name of the MIDI instrument that generated songs are rendered with
_INSTRUMENT_NAME = "Acoustic Grand Piano"
# Per-column factors: song maps are divided by these when normalized and
# multiplied by them when converted back to notes
_SCALING_FACTORS = pd.Series(
    [64.024558, 0.101410, 0.199386],
    index=["pitch", "step", "duration"],
)
def midi_to_notes(midi_file: str) -> pd.DataFrame:
    """
    Convert a midi file to a "song map" (dataframe where each note is broken
    into its components).

    Only the first instrument of the file is read. Notes are ordered by
    start time, and at most _CAP notes are kept.

    Parameters:
        midi_file (str): Path to the midi file.

    Returns:
        pd.DataFrame: Nx3 frame where each row is a note and the columns
            are "pitch", "step" (start time minus the previous note's
            start time; 0 for the first note) and "duration" (end minus
            start), each divided by its entry in _SCALING_FACTORS. The
            frame is empty when the file has no instrument or no notes.
    """
    columns = ["pitch", "step", "duration"]
    pm = pretty_midi.PrettyMIDI(midi_file)

    # A file without instruments or notes yields an empty song map rather
    # than an IndexError
    raw_notes = pm.instruments[0].notes if pm.instruments else []

    # Sort the notes by start time and cap the song to match the model's
    # architecture (capping first avoids building arrays that get discarded)
    sorted_notes = sorted(raw_notes, key=lambda note: note.start)[:_CAP]
    if not sorted_notes:
        return pd.DataFrame(columns=columns, dtype=float)

    # Separate each individual note in pitch, step and duration
    pitches = np.array([note.pitch for note in sorted_notes])
    starts = np.array([note.start for note in sorted_notes])
    ends = np.array([note.end for note in sorted_notes])
    # Prepending the first start makes the first step 0
    steps = np.diff(starts, prepend=starts[0])

    notes_df = pd.DataFrame(
        {"pitch": pitches, "step": steps, "duration": ends - starts},
        columns=columns,
    )
    return notes_df / _SCALING_FACTORS  # Scale
def display_audio(pm: pretty_midi.PrettyMIDI, seconds=-1) -> display.Audio:
    """
    Display a song in PrettyMIDI format as a display.Audio object.
    This method specially comes in useful in a jupyter notebook.

    Parameters:
        pm (pretty_midi.PrettyMIDI): PrettyMIDI object containing a song.
        seconds (int or float): Length, in seconds, of the leading part of
            the song to be displayed. When negative (conventionally -1) or
            None, the full length is taken.

    Returns:
        display.Audio: Song as an object allowing for display.
    """
    waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
    # Take a sample of the generated waveform to mitigate kernel resets
    if seconds is None or seconds < 0:
        waveform_short = waveform
    else:
        # Slice bounds must be integers, so fractional durations are
        # truncated to a whole number of samples
        waveform_short = waveform[:int(seconds * _SAMPLING_RATE)]
    return display.Audio(waveform_short, rate=_SAMPLING_RATE)
def map_to_wav(song_map: pd.DataFrame, out_file: str, velocity: int=50) -> pretty_midi.PrettyMIDI:
    """
    Convert a "song map" to a PrettyMIDI object (reverse process with respect
    to midi_to_notes) and (optionally) save it as a midi file.

    Parameters:
        song_map (pd.DataFrame): 3xN matrix where each column is a note and
            the rows are, in order, pitch, step and duration (normalized
            values). Singleton dimensions are squeezed out first.
            NOTE(review): anything tf.squeeze accepts appears to work here,
            e.g. a tensor produced by the model -- confirm against callers.
        out_file (str): Path or file to write the .mid file to. If None
            (or otherwise falsy), no saving is done.
        velocity (int): Note loudness, i. e. the hardness a piano key is
            struck with.

    Returns:
        pretty_midi.PrettyMIDI: PrettyMIDI object containing the song's
            representation.
    """
    # Drop singleton dimensions and put one note per row
    song_map_T = tf.squeeze(song_map).numpy().T
    notes = pd.DataFrame(
        song_map_T, columns=["pitch", "step", "duration"]
    ).mul(_SCALING_FACTORS, axis=1)  # Undo the normalization
    # Clip before casting so out-of-range values saturate at the bounds
    # instead of overflowing int32 during the cast
    notes["pitch"] = notes["pitch"].clip(1, 127).astype("int32")

    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(
        program=pretty_midi.instrument_name_to_program(_INSTRUMENT_NAME)
    )

    # Each step is relative to the previous note's start, so accumulate it
    start = 0.0
    for pitch, step, duration in zip(
        notes["pitch"].tolist(),
        notes["step"].tolist(),
        notes["duration"].tolist(),
    ):
        start = start + step
        instrument.notes.append(
            pretty_midi.Note(
                velocity=velocity,
                pitch=pitch,
                start=start,
                end=start + duration,
            )
        )

    pm.instruments.append(instrument)
    if out_file:
        pm.write(out_file)
    return pm
def generate_and_display(model: VAE,
                         out_file: str=None,
                         z_sample: tf.Tensor=None,
                         velocity: int=50,
                         seconds: int=-1) -> display.Audio:
    """
    Generate a song with the model, (optionally) save it and return it as
    a displayable audio object.

    Parameters:
        model (VAE): Instance of VAE to generate the song with.
        out_file (str): Path or file to write the .mid file to. If None,
            no saving is done.
        z_sample (tf.Tensor): Song encoding handed to model.generate. If
            None, an unconditioned piece is generated.
        velocity (int): Note loudness, i. e. the hardness a piano key is
            struck with.
        seconds (int): Time fraction of the song to be displayed. When
            set to -1, the full length is taken.

    Returns:
        display.Audio: Song as an object allowing for display.
    """
    generated_map = model.generate(z_sample)
    # Build the PrettyMIDI song (and write it out when out_file is given)
    song = map_to_wav(song_map=generated_map, out_file=out_file, velocity=velocity)
    return display_audio(pm=song, seconds=seconds)