Spaces:

TencentARC
/

MasaCtrl

Sleeping

File size: 6,857 Bytes

import gradio as gr
import numpy as np
import torch
from diffusers import DDIMScheduler
from pytorch_lightning import seed_everything

from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import (AttentionBase,
                                     regiter_attention_editor_diffusers)

from .app_utils import global_context

torch.set_grad_enabled(False)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
#     "cpu")
# model_path = "andite/anything-v4.0"
# scheduler = DDIMScheduler(beta_start=0.00085,
#                           beta_end=0.012,
#                           beta_schedule="scaled_linear",
#                           clip_sample=False,
#                           set_alpha_to_one=False)
# model = MasaCtrlPipeline.from_pretrained(model_path,
#                                          scheduler=scheduler).to(device)


def consistent_synthesis(source_prompt, target_prompt, starting_step,
                         starting_layer, image_resolution, ddim_steps, scale,
                         seed, appended_prompt, negative_prompt):
    from masactrl.masactrl import MutualSelfAttentionControl

    model = global_context["model"]
    device = global_context["device"]

    seed_everything(seed)

    with torch.no_grad():
        if appended_prompt is not None:
            source_prompt += appended_prompt
            target_prompt += appended_prompt
        prompts = [source_prompt, target_prompt]

        # initialize the noise map
        start_code = torch.randn([1, 4, 64, 64], device=device)
        start_code = start_code.expand(len(prompts), -1, -1, -1)

        # inference the synthesized image without MasaCtrl
        editor = AttentionBase()
        regiter_attention_editor_diffusers(model, editor)
        target_image_ori = model([target_prompt],
                                 latents=start_code[-1:],
                                 guidance_scale=7.5)
        target_image_ori = target_image_ori.cpu().permute(0, 2, 3, 1).numpy()

        # inference the synthesized image with MasaCtrl
        # hijack the attention module
        controller = MutualSelfAttentionControl(starting_step, starting_layer)
        regiter_attention_editor_diffusers(model, controller)

        # inference the synthesized image
        image_masactrl = model(prompts, latents=start_code, guidance_scale=7.5)
        image_masactrl = image_masactrl.cpu().permute(0, 2, 3, 1).numpy()

    return [image_masactrl[0], target_image_ori[0],
            image_masactrl[1]]  # source, fixed seed, masactrl


def create_demo_synthesis():
    with gr.Blocks() as demo:
        gr.Markdown("## **Input Settings**")
        with gr.Row():
            with gr.Column():
                source_prompt = gr.Textbox(
                    label="Source Prompt",
                    value='1boy, casual, outdoors, sitting',
                    interactive=True)
                target_prompt = gr.Textbox(
                    label="Target Prompt",
                    value='1boy, casual, outdoors, standing',
                    interactive=True)
                with gr.Row():
                    ddim_steps = gr.Slider(label="DDIM Steps",
                                            minimum=1,
                                            maximum=999,
                                            value=50,
                                            step=1)
                    starting_step = gr.Slider(
                        label="Step of MasaCtrl",
                        minimum=0,
                        maximum=999,
                        value=4,
                        step=1)
                    starting_layer = gr.Slider(label="Layer of MasaCtrl",
                                                minimum=0,
                                                maximum=16,
                                                value=10,
                                                step=1)
                run_btn = gr.Button("Run")
            with gr.Column():
                appended_prompt = gr.Textbox(label="Appended Prompt", value='')
                negative_prompt = gr.Textbox(label="Negative Prompt", value='')
                with gr.Row():
                    image_resolution = gr.Slider(label="Image Resolution",
                                                minimum=256,
                                                maximum=768,
                                                value=512,
                                                step=64)
                    scale = gr.Slider(label="CFG Scale",
                                    minimum=0.1,
                                    maximum=30.0,
                                    value=7.5,
                                    step=0.1)
                    seed = gr.Slider(label="Seed",
                                    minimum=-1,
                                    maximum=2147483647,
                                    value=42,
                                    step=1)

        gr.Markdown("## **Output**")
        with gr.Row():
            image_source = gr.Image(label="Source Image")
            image_fixed = gr.Image(label="Image with Fixed Seed")
            image_masactrl = gr.Image(label="Image with MasaCtrl")

        inputs = [
            source_prompt, target_prompt, starting_step, starting_layer,
            image_resolution, ddim_steps, scale, seed, appended_prompt,
            negative_prompt
        ]
        run_btn.click(consistent_synthesis, inputs,
                        [image_source, image_fixed, image_masactrl])

        gr.Examples(
            [[
                "1boy, bishounen, casual, indoors, sitting, coffee shop, bokeh",
                "1boy, bishounen, casual, indoors, standing, coffee shop, bokeh",
                42
            ],
                [
                    "1boy, casual, outdoors, sitting",
                    "1boy, casual, outdoors, sitting, side view", 42
                ],
                [
                    "1boy, casual, outdoors, sitting",
                    "1boy, casual, outdoors, standing, clapping hands", 42
                ],
                [
                    "1boy, casual, outdoors, sitting",
                    "1boy, casual, outdoors, sitting, shows thumbs up", 42
                ],
                [
                    "1boy, casual, outdoors, sitting",
                    "1boy, casual, outdoors, sitting, with crossed arms", 42
                ],
                [
                    "1boy, casual, outdoors, sitting",
                    "1boy, casual, outdoors, sitting, rasing hands", 42
                ]],
            [source_prompt, target_prompt, seed],
        )
    return demo


if __name__ == "__main__":
    demo_syntehsis = create_demo_synthesis()
    demo_synthesis.launch()