teticio's picture
refactor and improve apps
d533c9c
raw
history blame
1.72 kB
import numpy as np
from PIL import Image
from torch import cuda
from diffusers import DDPMPipeline
from librosa.beat import beat_track
from .mel import Mel
VERSION = "1.0.1"
class AudioDiffusion:
def __init__(self,
model_id="teticio/audio-diffusion-256",
resolution=256,
cuda=cuda.is_available()):
"""Class for generating audio using Denoising Diffusion Probabilistic Models.
Args:
model_id (String): name of model (local directory or Hugging Face Hub)
resolution (int): size of square mel spectrogram in pixels
cuda (bool): use CUDA?
"""
self.mel = Mel(x_res=resolution, y_res=resolution)
self.model_id = model_id
self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
if cuda:
self.ddpm.to("cuda")
def generate_spectrogram_and_audio(self):
"""Generate random mel spectrogram and convert to audio.
Returns:
PIL Image: mel spectrogram
(float, array): sample rate and raw audio
"""
images = self.ddpm(output_type="numpy")["sample"]
images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
image = Image.fromarray(images[0][0])
audio = self.mel.image_to_audio(image)
return image, (self.mel.get_sample_rate(), audio)
@staticmethod
def loop_it(audio, sample_rate, loops=12):
tempo, beats = beat_track(y=audio, sr=sample_rate, units='samples')
if len(beats) > 8:
return np.tile(audio[beats[0]:beats[8]], loops)
if len(beats) > 4:
return np.tile(audio[beats[0]:beats[4]], loops)
return None