Spaces:
app v2
app.py
CHANGED
@@ -1,45 +1,286 @@
-from diffusers import
-import
-pipe = DiffusionPipeline.from_pretrained(repo_id)
-mel_spectr = pipe(prompt, num_inference_steps=steps).images[0]
-wav_bytes, duration_s = audio.wav_bytes_from_spectrogram_image(mel_spectr)
-with gr.
-app.launch()
+from diffusers import StableDiffusionImg2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionInpaintPipeline
 import torch
+from PIL import Image, ImageDraw
 import os
 import numpy as np
 from scipy.io.wavfile import read
+import gradio as gr
+

 os.system('git clone https://github.com/hmartiro/riffusion-inference.git riffusion')

+from riffusion.riffusion.riffusion_pipeline import RiffusionPipeline
+from riffusion.riffusion.datatypes import PromptInput, InferenceInput
+from riffusion.riffusion.audio import wav_bytes_from_spectrogram_image
+from PIL import Image
+import struct
+import random
+
 repo_id = "riffusion/riffusion-model-v1"

+model = RiffusionPipeline.from_pretrained(
+    repo_id,
+    revision="main",
+    torch_dtype=torch.float16,
+    safety_checker=lambda images, **kwargs: (images, False),
+)
+
+if torch.cuda.is_available():
+    model.to("cuda")
+
+pipe_inpaint = StableDiffusionInpaintPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, safety_checker=lambda images, **kwargs: (images, False),)
+pipe_inpaint.scheduler = DPMSolverMultistepScheduler.from_config(pipe_inpaint.scheduler.config)
+
+# pipe_inpaint.enable_xformers_memory_efficient_attention()
+
 if torch.cuda.is_available():
+    pipe_inpaint = pipe_inpaint.to("cuda")
+
+
+def get_init_image(image, overlap, feel):
+
+    width, height = image.size
+    init_image = Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
+    # Crop the right side of the original image with `overlap_width`
+    cropped_img = image.crop((width - int(width*overlap), 0, width, height))
+    init_image.paste(cropped_img, (0, 0))
+
+    return init_image
+
+def get_mask(image, overlap):
+
+    width, height = image.size
+
+    mask = Image.new("RGB", (width, height), color="white")
+    draw = ImageDraw.Draw(mask)
+    draw.rectangle((0, 0, int(overlap * width), height), fill="black")
+    return mask
+
+def i2i(prompt, steps, feel, seed):
+    # return pipe_i2i(
+    #     prompt,
+    #     num_inference_steps=steps,
+    #     image=Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB"),
+    # ).images[0]
+
+    prompt_input_start = PromptInput(prompt=prompt, seed=seed)
+    prompt_input_end = PromptInput(prompt=prompt, seed=seed)
+
+    return model.riffuse(
+        inputs=InferenceInput(
+            start=prompt_input_start,
+            end=prompt_input_end,
+            alpha=1.0,
+            num_inference_steps=steps),
+        init_image=Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
+    )
+
+def outpaint(prompt, init_image, mask, steps):
+    return pipe_inpaint(
+        prompt,
+        num_inference_steps=steps,
+        image=init_image,
+        mask_image=mask,
+    ).images[0]
+
+
+def generate(prompt, steps, num_iterations, feel, seed):
+
+    if seed == 0:
+        seed = random.randint(0,4294967295)
+
+    num_images = num_iterations
+    overlap = 0.5
+    image_width, image_height = 512, 512  # dimensions of each output image
+    total_width = num_images * image_width - (num_images - 1) * int(overlap * image_width)  # total width of the stitched image
+
+    # Create a blank image with the desired dimensions
+    stitched_image = Image.new("RGB", (total_width, image_height), color="white")
+
+    # Initialize the x position for pasting the next image
+    x_pos = 0
+
+    image = i2i(prompt, steps, feel, seed)
+
+    for i in range(num_images):
+        # Generate the prompt, initial image, and mask for this iteration
+        init_image = get_init_image(image, overlap, feel)
+        mask = get_mask(init_image, overlap)
+
+        # Run the outpaint function to generate the output image
+        steps = 25
+        image = outpaint(prompt, init_image, mask, steps)
+
+        # Paste the output image onto the stitched image
+        stitched_image.paste(image, (x_pos, 0))
+
+        # Update the x position for the next iteration
+        x_pos += int((1 - overlap) * image_width)
+
+    wav_bytes, duration_s = wav_bytes_from_spectrogram_image(stitched_image)
+
+    mask = Image.new("RGB", (512, 512), color="white")
+    bg_image = outpaint(prompt, init_image, mask, steps)
+    bg_image.save("bg_image.png")

+    # return read(wav_bytes)
+    with open("output.wav", "wb") as f:
+        f.write(wav_bytes.read())

+    return gr.make_waveform("output.wav", bg_image="bg_image.png", bar_count=int(duration_s*25))


+###############################################
+
+def riffuse(steps, feel, init_image, prompt_start, seed_start, denoising_start=0.75, guidance_start=7.0, prompt_end=None, seed_end=None, denoising_end=0.75, guidance_end=7.0, alpha=0.5):
+
+    prompt_input_start = PromptInput(prompt=prompt_start, seed=seed_start, denoising=denoising_start, guidance=guidance_start)
+
+    prompt_input_end = PromptInput(prompt=prompt_end, seed=seed_end, denoising=denoising_end, guidance=guidance_end)
+
+    input = InferenceInput(
+        start=prompt_input_start,
+        end=prompt_input_end,
+        alpha=alpha,
+        num_inference_steps=steps,
+        seed_image_id=feel,
+        # mask_image_id="mask_beat_lines_80.png"
+    )
+
+    image = model.riffuse(inputs=input, init_image=init_image)
+
+    wav_bytes, duration_s = wav_bytes_from_spectrogram_image(image)
+
+    return wav_bytes, image
+
+def generate_riffuse(prompt_start, steps, num_iterations, feel, prompt_end=None, seed_start=None, seed_end=None, denoising_start=0.75, denoising_end=0.75, guidance_start=7.0, guidance_end=7.0):
+    """Generate a WAV file of length seconds using the Riffusion model.
+
+    Args:
+        length (int): Length of the WAV file in seconds, must be divisible by 5.
+        prompt_start (str): Prompt to start with.
+        prompt_end (str, optional): Prompt to end with. Defaults to prompt_start.
+        overlap (float, optional): Overlap between audio clips as a fraction of the image size. Defaults to 0.2.
+    """
+
+    # open the initial image and convert it to RGB
+    init_image = Image.open(f"riffusion/seed_images/{feel}.png").convert("RGB")
+
+    if prompt_end is None:
+        prompt_end = prompt_start
+    if seed_start is None:
+        seed_start = random.randint(0,4294967295)
+    if seed_end is None:
+        seed_end = seed_start
+
+    # one riffuse() generates 5 seconds of audio
+    wav_list = []
+
+    for i in range(int(num_iterations)):
+
+        alpha = i / (num_iterations - 1)
+        print(alpha)
+        wav_bytes, image = riffuse(steps, feel, init_image, prompt_start, seed_start, denoising_start, guidance_start, prompt_end, seed_end, denoising_end, guidance_end, alpha=alpha)
+        wav_list.append(wav_bytes)
+
+        init_image = image
+
+        seed_start = seed_end
+        seed_end = seed_start + 1
+
+    # return read(wav_bytes)
+
+    mask = Image.new("RGB", (512, 512), color="white")
+    bg_image = outpaint(f"{prompt_start} and {prompt_end}", init_image, mask, steps)
+    bg_image.save("bg_image.png")
+
+    with open("output.wav", "wb") as f:
+        f.write(wav_bytes.read())
+
+    return gr.make_waveform("output.wav", bg_image="bg_image.png")
+
+
+def wav_list_to_wav(wav_list):
+
+    # remove headers from the WAV files
+    data = [wav.read()[44:] for wav in wav_list]
+
+    # concatenate the data
+    concatenated_data = b"".join(data)
+
+    # create a new RIFF header
+    channels = 1
+    sample_rate = 44100
+    bytes_per_second = channels * sample_rate
+    new_header = struct.pack("<4sI4s4sIHHIIHH4sI", b"RIFF", len(concatenated_data) + 44 - 8, b"WAVE", b"fmt ", 16, 1, channels, sample_rate, bytes_per_second, 2, 16, b"data", len(concatenated_data))
+
+    # combine the header and data to create the final WAV file
+    final_wav = new_header + concatenated_data
+    return final_wav
+
+###############################################
+
+def on_submit(prompt_1, prompt_2, steps, num_iterations, feel, seed):
+    if prompt_1 == "":
+        return None, gr.update(value="First prompt is required.")
+    if prompt_2 == "":
+        return generate(prompt_1, steps, num_iterations, feel, seed), None
+    else:
+        return generate_riffuse(prompt_1, steps, num_iterations, feel, prompt_end=prompt_2, seed_start=seed), None
+
+
+def on_num_iterations_change(n, prompt_2):
+    if n is None:
+        return gr.update(value="")
+    x = 5 if prompt_2 != "" else 2.5
+    total_length = x + x * n
+    return gr.update(value=f"Total length: {total_length:.2f} seconds")

 with gr.Blocks() as app:
+    gr.Markdown("## Riffusion")
+    gr.Markdown("""Generate audio using the [Riffusion](https://huggingface.co/riffusion/riffusion-model-v1) model.<br>
+    In single prompt mode you can generate up to ~1 minute of audio with smooth transitions between sections. (beta)<br>
+    Bi-prompt mode interpolates between two prompts. It can generate up to ~2 minutes of audio, but the transitions between sections are more abrupt.""")
+
+    with gr.Row():
+        with gr.Group():
+            with gr.Row():
+                prompt_1 = gr.Textbox(lines=1, label="Start from", placeholder="Starting prompt")
+                prompt_2 = gr.Textbox(lines=1, label="End with (optional)", placeholder="Prompt to shift towards at the end")
+            with gr.Row():
+                steps = gr.Slider(minimum=1, maximum=100, value=25, label="Steps per section")
+                num_iterations = gr.Slider(minimum=2, maximum=25, value=2, step=1, label="Number of sections")
+            with gr.Row():
+                feel = gr.Dropdown(["og_beat", "agile", "vibes", "motorway", "marim"], value="og_beat", label="Feel")
+                seed = gr.Slider(minimum=0, maximum=4294967295, value=0, step=1, label="Seed (0 for random)")
+
+            info = gr.Markdown()
+            btn_generate = gr.Button(value="Generate")
+        with gr.Column():
+            video = gr.Video()

+    inputs = [prompt_1, prompt_2, steps, num_iterations, feel, seed]
+    outputs = [video, info]

+    num_iterations.change(on_num_iterations_change, [num_iterations, prompt_2], [info])
+    prompt_1.submit(on_submit, inputs, outputs)
+    prompt_2.submit(on_submit, inputs, outputs)
+    btn_generate.click(on_submit, inputs, outputs)

+    examples = gr.Examples(
+        examples=[
+            ["typing", "dance beat", "og_beat", 10],
+            ["synthwave", "jazz", "agile", 10],
+            ["rap battle freestyle", "", "og_beat", 10],
+            ["techno club banger", "", "og_beat", 10],
+            ["acoustic folk ballad", "", "agile", 10],
+            ["blues guitar riff", "", "agile", 5],
+            ["jazzy trumpet solo", "", "og_beat", 5],
+            ["classical symphony orchestra", "", "vibes", 10],
+            ["rock and roll power chord", "", "motorway", 5],
+            ["soulful R&B love song", "", "marim", 10],
+            ["reggae dub beat", "sunset chill", "og_beat", 10],
+            ["country western twangy guitar", "", "agile", 10]],
+        inputs=[prompt_1, prompt_2, feel, num_iterations])

+app.launch()
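
Note on the stitching logic added in this change: `generate` extends a single Riffusion spectrogram by carrying the right half of each tile into the left half of the next one, letting the inpainting pipeline fill the remainder, and pasting the tiles onto one wide spectrogram before converting it to audio. The sketch below illustrates only that crop/mask/paste geometry with plain PIL; `fake_outpaint` is a hypothetical stand-in for `pipe_inpaint` so the snippet runs without model weights or a GPU.

from PIL import Image, ImageDraw

overlap = 0.5
width, height = 512, 512          # each tile is a 512x512 spectrogram (~5 s of audio)
num_images = 3
total_width = num_images * width - (num_images - 1) * int(overlap * width)

def fake_outpaint(init_image, mask):
    # Stand-in for pipe_inpaint: the real pipeline regenerates the white (masked)
    # region from the prompt; here the tile is returned unchanged.
    return init_image

stitched = Image.new("RGB", (total_width, height), color="white")
image = Image.new("RGB", (width, height), color="gray")  # pretend first spectrogram
x_pos = 0

for i in range(num_images):
    # Carry the right `overlap` fraction of the previous tile into the left of the
    # next one, so consecutive clips share audio content at the seam.
    init_image = Image.new("RGB", (width, height), color="white")
    init_image.paste(image.crop((width - int(width * overlap), 0, width, height)), (0, 0))

    # Inpainting mask: black = keep (the carried-over strip), white = regenerate.
    mask = Image.new("RGB", (width, height), color="white")
    ImageDraw.Draw(mask).rectangle((0, 0, int(overlap * width), height), fill="black")

    image = fake_outpaint(init_image, mask)
    stitched.paste(image, (x_pos, 0))
    x_pos += int((1 - overlap) * width)

print(stitched.size)  # (1024, 512): three half-overlapping tiles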