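# NOTE: the three lines originally here ("Spaces:", "Configuration error",
# "Configuration error") are residue from the Hugging Face Spaces page this
# file was copied from; they are not part of the program.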
from diffusers import StableDiffusionImg2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline | |
import torch | |
from PIL import Image, ImageDraw | |
import os | |
import numpy as np | |
from scipy.io.wavfile import read | |
import gradio as gr | |
# Fetch the riffusion-inference sources at startup so that the
# `riffusion.riffusion.*` imports below can resolve.
import subprocess
import sys

# Skip the clone when a previous start already created the checkout;
# `git clone` refuses to write into an existing non-empty directory.
if not os.path.isdir("riffusion"):
    try:
        # Argument list instead of a shell string: no shell parsing involved.
        subprocess.run(
            [
                "git",
                "clone",
                "https://github.com/hmartiro/riffusion-inference.git",
                "riffusion",
            ],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as clone_error:
        # Still best effort (start-up continues), but report the cause so a
        # later ImportError for `riffusion` is easy to trace back.
        print(
            f"Could not clone riffusion-inference: {clone_error}",
            file=sys.stderr,
        )
from riffusion.riffusion.riffusion_pipeline import RiffusionPipeline | |
from riffusion.riffusion.datatypes import PromptInput, InferenceInput | |
from riffusion.riffusion.audio import wav_bytes_from_spectrogram_image | |
from PIL import Image | |
import struct | |
import random | |
# Hugging Face Hub repository from which both pipelines are loaded.
repo_id = "riffusion/riffusion-model-v1"


def _skip_safety_check(images, **kwargs):
    """Stand-in safety checker: hand the images back with a False flag."""
    return images, False


# Spectrogram generation pipeline (used by `i2i` and `riffuse`).
model = RiffusionPipeline.from_pretrained(
    repo_id,
    revision="main",
    torch_dtype=torch.float16,
    safety_checker=_skip_safety_check,
)
if torch.cuda.is_available():
    model.to("cuda")

# Inpainting pipeline loaded from the same repository (used by `outpaint`),
# with its scheduler swapped for DPM-Solver multistep.
pipe_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
    repo_id,
    torch_dtype=torch.float16,
    safety_checker=_skip_safety_check,
)
pipe_inpaint.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe_inpaint.scheduler.config
)
# pipe_inpaint.enable_xformers_memory_efficient_attention()
if torch.cuda.is_available():
    pipe_inpaint = pipe_inpaint.to("cuda")
def get_init_image(image, overlap, feel):
    """Build the canvas for the next outpainting step.

    Args:
        image: Previously generated PIL image.
        overlap: Fraction of ``image``'s width to carry over.
        feel: Name of the seed image under ``riffusion/seed_images``.

    Returns:
        The RGB seed image with the right-hand ``overlap`` strip of
        ``image`` pasted at its top-left corner.
    """
    canvas = Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
    src_width, src_height = image.size
    # Left edge of the strip that is carried over from the previous image.
    strip_left = src_width - int(src_width * overlap)
    carried_over = image.crop((strip_left, 0, src_width, src_height))
    canvas.paste(carried_over, (0, 0))
    return canvas
def get_mask(image, overlap):
    """Create the mask that goes with a canvas from ``get_init_image``.

    The mask has the size of ``image``; it is white everywhere except for a
    black rectangle covering the left ``overlap`` fraction of the width.
    NOTE(review): presumably black marks the carried-over strip to keep and
    white the area to repaint — confirm against the inpainting pipeline docs.
    """
    size = image.size
    kept_width = int(overlap * size[0])
    mask = Image.new("RGB", size, color="white")
    ImageDraw.Draw(mask).rectangle((0, 0, kept_width, size[1]), fill="black")
    return mask
def i2i(prompt, steps, feel, seed):
    """Run the Riffusion pipeline once on the seed image for ``feel``.

    Start and end inputs are built from the same prompt and seed, and the
    request uses ``alpha=1.0``.

    Returns:
        Whatever ``model.riffuse`` returns for that request.
    """
    start, end = (PromptInput(prompt=prompt, seed=seed) for _ in range(2))
    request = InferenceInput(
        start=start,
        end=end,
        alpha=1.0,
        num_inference_steps=steps,
    )
    seed_image = Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
    return model.riffuse(inputs=request, init_image=seed_image)
def outpaint(prompt, init_image, mask, steps):
    """Run the inpainting pipeline and return the first image it produces.

    Args:
        prompt: Text prompt passed to the pipeline.
        init_image: Image to inpaint.
        mask: Mask image passed as ``mask_image``.
        steps: Value passed as ``num_inference_steps``.
    """
    result = pipe_inpaint(
        prompt,
        image=init_image,
        mask_image=mask,
        num_inference_steps=steps,
    )
    return result.images[0]
def generate(prompt, steps, num_iterations, feel, seed):
    """Generate audio for a single prompt by repeated outpainting.

    A first image is produced with ``i2i``. Each section is then outpainted
    from the right half of the previous image and pasted onto one wide
    image, which is converted to WAV. The audio is written to
    ``output.wav`` and a background picture to ``bg_image.png``.

    Args:
        prompt: Text prompt used for every section.
        steps: Inference steps for the initial ``i2i`` image. The
            outpainting passes use a fixed 25 steps.
        num_iterations: Number of sections to generate; must be >= 1.
        feel: Name of the seed image under ``riffusion/seed_images``.
        seed: Seed for the initial image; ``0`` (or ``None``) picks a
            random one.

    Returns:
        The value returned by ``gr.make_waveform`` for the written files.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    # UI sliders may hand over floats; range() and the pipelines need ints.
    steps = int(steps)
    num_images = int(num_iterations)
    seed = int(seed or 0)
    if num_images < 1:
        # Without any section there is no image to build the background from.
        raise ValueError("num_iterations must be at least 1")
    if seed == 0:
        seed = random.randint(0, 4294967295)

    overlap = 0.5
    image_width, image_height = 512, 512  # dimensions of each output image
    # Horizontal distance between the left edges of consecutive sections.
    stride = int((1 - overlap) * image_width)
    # Total width of the stitched image.
    total_width = num_images * image_width - (num_images - 1) * int(overlap * image_width)
    stitched_image = Image.new("RGB", (total_width, image_height), color="white")

    # Outpainting always ran with 25 steps; keep that in a local of its own
    # instead of overwriting the caller's ``steps`` inside the loop.
    outpaint_steps = 25

    image = i2i(prompt, steps, feel, seed)
    for index in range(num_images):
        # Canvas and mask for this section, seeded with the previous image.
        init_image = get_init_image(image, overlap, feel)
        mask = get_mask(init_image, overlap)
        image = outpaint(prompt, init_image, mask, outpaint_steps)
        stitched_image.paste(image, (index * stride, 0))

    wav_bytes, duration_s = wav_bytes_from_spectrogram_image(stitched_image)

    # Background picture: outpaint the last canvas with an all-white mask.
    full_mask = Image.new("RGB", (512, 512), color="white")
    bg_image = outpaint(prompt, init_image, full_mask, outpaint_steps)
    bg_image.save("bg_image.png")

    with open("output.wav", "wb") as f:
        f.write(wav_bytes.read())
    return gr.make_waveform("output.wav", bg_image="bg_image.png", bar_count=int(duration_s * 25))
############################################### | |
def riffuse(steps, feel, init_image, prompt_start, seed_start, denoising_start=0.75, guidance_start=7.0, prompt_end=None, seed_end=None, denoising_end=0.75, guidance_end=7.0, alpha=0.5):
    """Run one Riffusion inference between two prompt inputs.

    Args:
        steps: Value passed as ``num_inference_steps``.
        feel: Seed image id passed as ``seed_image_id``.
        init_image: Image handed to ``model.riffuse`` as ``init_image``.
        prompt_start, seed_start, denoising_start, guidance_start:
            Fields of the start ``PromptInput``.
        prompt_end, seed_end, denoising_end, guidance_end:
            Fields of the end ``PromptInput``.
        alpha: Value passed as ``alpha`` in the ``InferenceInput``.

    Returns:
        Tuple ``(wav_bytes, image)``: the audio obtained from the generated
        image via ``wav_bytes_from_spectrogram_image``, and that image.
    """
    start = PromptInput(
        prompt=prompt_start,
        seed=seed_start,
        denoising=denoising_start,
        guidance=guidance_start,
    )
    end = PromptInput(
        prompt=prompt_end,
        seed=seed_end,
        denoising=denoising_end,
        guidance=guidance_end,
    )
    inference_input = InferenceInput(
        start=start,
        end=end,
        alpha=alpha,
        num_inference_steps=steps,
        seed_image_id=feel,
        # mask_image_id="mask_beat_lines_80.png"
    )
    image = model.riffuse(inputs=inference_input, init_image=init_image)
    # The duration returned alongside the audio is not needed here.
    wav_bytes, _ = wav_bytes_from_spectrogram_image(image)
    return wav_bytes, image
def generate_riffuse(prompt_start, steps, num_iterations, feel, prompt_end=None, seed_start=None, seed_end=None, denoising_start=0.75, denoising_end=0.75, guidance_start=7.0, guidance_end=7.0):
    """Generate audio that moves from one prompt to another.

    Calls ``riffuse`` once per section with ``alpha`` stepping evenly from
    0 to 1, feeding each generated image in as the next ``init_image`` and
    advancing the seed pair by one after each section. The clips are
    concatenated into ``output.wav`` and a background picture is written
    to ``bg_image.png``.

    Args:
        prompt_start: Prompt to start with.
        steps: Inference steps per section.
        num_iterations: Number of sections to generate; must be >= 1.
        feel: Name of the seed image under ``riffusion/seed_images``.
        prompt_end: Prompt to end with. Defaults to ``prompt_start``.
        seed_start: Seed of the first section; ``None`` or ``0`` picks a
            random one.
        seed_end: End seed of the first section. Defaults to ``seed_start``.
        denoising_start, denoising_end: Denoising values of the start and
            end prompt inputs.
        guidance_start, guidance_end: Guidance values of the start and end
            prompt inputs.

    Returns:
        The value returned by ``gr.make_waveform`` for the written files.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    # UI sliders may hand over floats; range() and the pipelines need ints.
    steps = int(steps)
    num_iterations = int(num_iterations)
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    # open the initial image and convert it to RGB
    init_image = Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
    if prompt_end is None:
        prompt_end = prompt_start
    # Compare by value, not identity; a missing seed is random as well,
    # because the seed arithmetic in the loop needs an integer.
    if not seed_start:
        seed_start = random.randint(0, 4294967295)
    seed_start = int(seed_start)
    seed_end = seed_start if seed_end is None else int(seed_end)

    # one riffuse() generates 5 seconds of audio
    wav_list = []
    last_index = num_iterations - 1
    for i in range(num_iterations):
        # A single section has no range to interpolate over; use alpha = 0.
        alpha = i / last_index if last_index else 0.0
        print(alpha)
        wav_bytes, image = riffuse(steps, feel, init_image, prompt_start, seed_start, denoising_start, guidance_start, prompt_end, seed_end, denoising_end, guidance_end, alpha=alpha)
        wav_list.append(wav_bytes)
        # Chain the sections: the new image seeds the next one.
        init_image = image
        seed_start = seed_end
        seed_end = seed_start + 1

    # Background picture: outpaint the last image with an all-white mask.
    mask = Image.new("RGB", (512, 512), color="white")
    bg_image = outpaint(f"{prompt_start} and {prompt_end}", init_image, mask, steps)
    bg_image.save("bg_image.png")
    with open("output.wav", "wb") as f:
        f.write(wav_list_to_wav(wav_list))
    return gr.make_waveform("output.wav", bg_image="bg_image.png")
def wav_list_to_wav(wav_list):
    """Concatenate several WAV clips into one WAV byte string.

    The first 44 bytes of every clip are dropped as its header, the
    remaining bytes are joined, and a new 44-byte RIFF/WAVE header is
    written for mono, 44100 Hz, 16-bit PCM.
    NOTE(review): assumes every clip has exactly that format and a plain
    44-byte header — nothing here verifies it.

    Args:
        wav_list: Iterable of binary file-like objects, each positioned at
            the start of a WAV file.

    Returns:
        bytes: Header plus concatenated sample data. An empty ``wav_list``
        yields a header-only file.
    """
    header_size = 44
    # remove headers from the WAV files and concatenate the data
    concatenated_data = b"".join(wav.read()[header_size:] for wav in wav_list)

    channels = 1
    sample_rate = 44100
    bits_per_sample = 16
    # Bytes per sample frame and per second must agree with the sample
    # width; the byte rate used to leave out the 2 bytes per sample.
    block_align = channels * bits_per_sample // 8
    bytes_per_second = sample_rate * block_align

    # create a new RIFF header
    new_header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        len(concatenated_data) + header_size - 8,  # file size minus "RIFF" + size field
        b"WAVE",
        b"fmt ",
        16,  # size of the fmt chunk
        1,  # audio format 1 = PCM
        channels,
        sample_rate,
        bytes_per_second,
        block_align,
        bits_per_sample,
        b"data",
        len(concatenated_data),
    )
    # combine the header and data to create the final WAV file
    return new_header + concatenated_data
############################################### | |
def on_submit(prompt_1, prompt_2, steps, num_iterations, feel, seed):
    """Handle a generate request from the UI.

    Returns:
        Tuple ``(video, info)``. With an empty first prompt the video is
        ``None`` and the info is an update carrying an error message;
        otherwise the info is ``None`` and the video comes from
        ``generate`` (no second prompt) or ``generate_riffuse``.
    """
    if prompt_1 == "":
        return None, gr.update(value="First prompt is required.")

    if prompt_2 != "":
        # Two prompts: shift from the first prompt towards the second.
        video = generate_riffuse(prompt_1, steps, num_iterations, feel, prompt_end=prompt_2, seed_start=seed)
    else:
        # Single prompt: outpainting-based generation.
        video = generate(prompt_1, steps, num_iterations, feel, seed)
    return video, None
def on_num_iterations_change(n, prompt_2):
    """Return an update showing the total length for ``n`` sections.

    With a second prompt the length is ``5 * n`` seconds, without one it is
    ``2.5 + 2.5 * n``. ``n`` being ``None`` clears the text.
    """
    if n is None:
        return gr.update(value="")
    total_length = 5 * n if prompt_2 != "" else 2.5 + 2.5 * n
    return gr.update(value=f"Total length: {total_length:.2f} seconds")
# Gradio UI. NOTE(review): the nesting below is reconstructed from statement
# order because the source had lost its indentation — confirm the layout.
with gr.Blocks() as app:
    gr.Markdown("## Riffusion")
    gr.Markdown("""Generate audio using the [Riffusion](https://huggingface.co/riffusion/riffusion-model-v1) model.<br>
In single prompt mode you can generate up to ~1 minute of audio with smooth transitions between sections. (beta)<br>
Bi-prompt mode interpolates between two prompts. It can generate up to ~2 minutes of audio, but the transitions between sections are more abrupt.""")

    with gr.Row():
        # Controls.
        with gr.Group():
            with gr.Row():
                prompt_1 = gr.Textbox(lines=1, label="Start from", placeholder="Starting prompt")
                prompt_2 = gr.Textbox(lines=1, label="End with (optional)", placeholder="Prompt to shift towards at the end")
            with gr.Row():
                steps = gr.Slider(minimum=1, maximum=100, value=25, label="Steps per section")
                num_iterations = gr.Slider(minimum=2, maximum=25, value=2, step=1, label="Number of sections")
            with gr.Row():
                feel = gr.Dropdown(["og_beat", "agile", "vibes", "motorway", "marim"], value="og_beat", label="Feel")
                seed = gr.Slider(minimum=0, maximum=4294967295, value=0, step=1, label="Seed (0 for random)")

            info = gr.Markdown()
            btn_generate = gr.Button(value="Generate")

        # Result.
        with gr.Column():
            video = gr.Video()

    inputs = [prompt_1, prompt_2, steps, num_iterations, feel, seed]
    outputs = [video, info]

    # Refresh the length hint whenever the section count changes.
    num_iterations.change(on_num_iterations_change, [num_iterations, prompt_2], [info])

    # Enter in either prompt box and the button all start a generation.
    for trigger in (prompt_1.submit, prompt_2.submit, btn_generate.click):
        trigger(on_submit, inputs, outputs)

    # Each example row: start prompt, end prompt, feel, number of sections.
    examples = gr.Examples(
        examples=[
            ["typing", "dance beat", "og_beat", 10],
            ["synthwave", "jazz", "agile", 10],
            ["rap battle freestyle", "", "og_beat", 10],
            ["techno club banger", "", "og_beat", 10],
            ["reggae dub beat", "sunset chill", "og_beat", 10],
            ["acoustic folk ballad", "", "agile", 10],
            ["blues guitar riff", "", "agile", 5],
            ["jazzy trumpet solo", "", "og_beat", 5],
            ["classical symphony orchestra", "", "vibes", 10],
            ["rock and roll power chord", "", "motorway", 5],
            ["soulful R&B love song", "", "marim", 10],
            ["country western twangy guitar", "", "agile", 10],
        ],
        inputs=[prompt_1, prompt_2, feel, num_iterations],
    )

app.launch()