Spaces:

TencentARC
/

MasaCtrl

Running on A10G

App Files Files Community

ljzycmd committed on Apr 27, 2023

Commit

5fc5efa

•

1 Parent(s): bfee1f8

Add hugging face space demo.

Browse files

Files changed (14) hide show

app.py +55 -0
gradio_app/app_utils.py +30 -0
gradio_app/image_synthesis_app.py +166 -0
gradio_app/images/corgi.jpg +0 -0
gradio_app/images/person.png +0 -0
gradio_app/real_image_editing_app.py +162 -0
masactrl/__init__.py +0 -0
masactrl/diffuser_utils.py +275 -0
masactrl/masactrl.py +280 -0
masactrl/masactrl_utils.py +212 -0
playground.ipynb +149 -0
playground_real.ipynb +188 -0
requirements.txt +3 -0
style.css +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import gradio as gr
+import numpy as np
+import torch
+from diffusers import DDIMScheduler
+from pytorch_lightning import seed_everything
+from masactrl.diffuser_utils import MasaCtrlPipeline
+from masactrl.masactrl_utils import (AttentionBase,
+                                     regiter_attention_editor_diffusers)
+torch.set_grad_enabled(False)
+from gradio_app.image_synthesis_app import create_demo_synthesis
+from gradio_app.real_image_editing_app import create_demo_editing
+from gradio_app.app_utils import global_context
+# Markdown shown at the top of the demo page.
+TITLE = "# [MasaCtrl](https://ljzycmd.github.io/projects/MasaCtrl/)"
+DESCRIPTION = "<b>Gradio demo for MasaCtrl</b>: [[GitHub]](https://github.com/TencentARC/MasaCtrl), \
+                [[Paper]](https://arxiv.org/abs/2304.08465). \
+                If MasaCtrl is helpful, please help to ⭐ the [Github Repo](https://github.com/TencentARC/MasaCtrl) 😊 </p>"
+DESCRIPTION += '<p>For faster inference without waiting in queue, \
+                you may duplicate the space and upgrade to GPU in settings. </p>'
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(TITLE)
+    gr.Markdown(DESCRIPTION)
+    # Checkpoint selector shared by both tabs; the default matches the model
+    # that gradio_app.app_utils loads at import time.
+    model_path_gr = gr.Dropdown(
+        ["andite/anything-v4.0",
+         "CompVis/stable-diffusion-v1-4",
+         "runwayml/stable-diffusion-v1-5"],
+        value="andite/anything-v4.0",
+        label="Model", info="Select the model to use!"
+    )
+    with gr.Tab("Consistent Synthesis"):
+        create_demo_synthesis()
+    with gr.Tab("Real Editing"):
+        create_demo_editing()
+    def reload_ckpt(model_path):
+        """Load the checkpoint `model_path` and publish it through `global_context`.
+
+        The pipeline is rebuilt with the scheduler and device already stored
+        in `global_context`, so both tabs pick up the new model on their next
+        run.
+        """
+        print("Reloading model from", model_path)
+        global_context["model"] = MasaCtrlPipeline.from_pretrained(
+            model_path, scheduler=global_context["scheduler"]).to(global_context["device"])
+        # Keep the recorded path in sync with the model that is actually
+        # loaded; update it only after the load above has succeeded.
+        global_context["model_path"] = model_path
+    model_path_gr.select(
+        reload_ckpt,
+        [model_path_gr]
+    )
+if __name__ == "__main__":
+    demo.launch()

gradio_app/app_utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import gradio as gr
+import numpy as np
+import torch
+from diffusers import DDIMScheduler
+from pytorch_lightning import seed_everything
+from masactrl.diffuser_utils import MasaCtrlPipeline
+from masactrl.masactrl_utils import (AttentionBase,
+                                     regiter_attention_editor_diffusers)
+# The demo only runs inference, so autograd is switched off process-wide.
+torch.set_grad_enabled(False)
+# Use the GPU when torch can see one, otherwise fall back to the CPU.
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+model_path = "andite/anything-v4.0"
+# DDIM scheduler settings used for both sampling and inversion.
+scheduler = DDIMScheduler(
+    beta_start=0.00085,
+    beta_end=0.012,
+    beta_schedule="scaled_linear",
+    clip_sample=False,
+    set_alpha_to_one=False,
+)
+# Load the default checkpoint once at import time and move it to `device`.
+model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler)
+model = model.to(device)
+# Mutable registry shared by the Gradio callbacks.
+global_context = dict(
+    model_path=model_path,
+    scheduler=scheduler,
+    model=model,
+    device=device,
+)

gradio_app/image_synthesis_app.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import gradio as gr
+import numpy as np
+import torch
+from diffusers import DDIMScheduler
+from pytorch_lightning import seed_everything
+from masactrl.diffuser_utils import MasaCtrlPipeline
+from masactrl.masactrl_utils import (AttentionBase,
+                                     regiter_attention_editor_diffusers)
+from .app_utils import global_context
+torch.set_grad_enabled(False)
+# device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
+#     "cpu")
+# model_path = "andite/anything-v4.0"
+# scheduler = DDIMScheduler(beta_start=0.00085,
+#                           beta_end=0.012,
+#                           beta_schedule="scaled_linear",
+#                           clip_sample=False,
+#                           set_alpha_to_one=False)
+# model = MasaCtrlPipeline.from_pretrained(model_path,
+#                                          scheduler=scheduler).to(device)
+def consistent_synthesis(source_prompt, target_prompt, starting_step,
+                         starting_layer, image_resolution, ddim_steps, scale,
+                         seed, appended_prompt, negative_prompt):
+    """Generate a source image and a MasaCtrl-edited target from one noise map.
+
+    Args:
+        source_prompt: prompt for the source image.
+        target_prompt: prompt for the edited image.
+        starting_step: first sampling step at which MasaCtrl is applied.
+        starting_layer: first attention layer at which MasaCtrl is applied.
+        image_resolution: side length in pixels of the square output; the
+            noise map is created at one eighth of this size.
+        ddim_steps: number of DDIM sampling steps.
+        scale: classifier-free guidance scale.
+        seed: value handed to `seed_everything`.
+        appended_prompt: text concatenated verbatim to both prompts
+            (skipped when None).
+        negative_prompt: text forwarded to the pipeline as `neg_prompt`.
+
+    Returns:
+        A list of three HWC numpy images: the source image, the target
+        sampled from the same noise without MasaCtrl, and the target sampled
+        with MasaCtrl.
+    """
+    from masactrl.masactrl import MutualSelfAttentionControl
+    model = global_context["model"]
+    device = global_context["device"]
+    # Slider values may arrive as floats; sizes and step counts must be ints.
+    image_resolution = int(image_resolution)
+    ddim_steps = int(ddim_steps)
+    starting_step = int(starting_step)
+    starting_layer = int(starting_layer)
+    seed_everything(seed)
+    # Forward the UI settings to every sampling call instead of relying on
+    # the pipeline defaults (512 px, 50 steps, guidance 7.5).
+    sampling_kwargs = dict(height=image_resolution,
+                           width=image_resolution,
+                           num_inference_steps=ddim_steps,
+                           guidance_scale=scale,
+                           neg_prompt=negative_prompt)
+    with torch.no_grad():
+        if appended_prompt is not None:
+            source_prompt += appended_prompt
+            target_prompt += appended_prompt
+        prompts = [source_prompt, target_prompt]
+        # initialize the noise map; the pipeline asserts a latent size of
+        # (height // 8, width // 8)
+        latent_size = image_resolution // 8
+        start_code = torch.randn([1, 4, latent_size, latent_size],
+                                 device=device)
+        start_code = start_code.expand(len(prompts), -1, -1, -1)
+        # inference the synthesized image without MasaCtrl
+        editor = AttentionBase()
+        regiter_attention_editor_diffusers(model, editor)
+        target_image_ori = model([target_prompt],
+                                 latents=start_code[-1:],
+                                 **sampling_kwargs)
+        target_image_ori = target_image_ori.cpu().permute(0, 2, 3, 1).numpy()
+        # inference the synthesized image with MasaCtrl
+        # hijack the attention module; total_steps must match the sampler so
+        # the controller's step range covers the whole run
+        controller = MutualSelfAttentionControl(starting_step, starting_layer,
+                                                total_steps=ddim_steps)
+        regiter_attention_editor_diffusers(model, controller)
+        # inference the synthesized image
+        image_masactrl = model(prompts, latents=start_code, **sampling_kwargs)
+        image_masactrl = image_masactrl.cpu().permute(0, 2, 3, 1).numpy()
+    return [image_masactrl[0], target_image_ori[0],
+            image_masactrl[1]]  # source, fixed seed, masactrl
+def create_demo_synthesis():
+    """Build the "consistent synthesis" Gradio UI and return its Blocks object.
+
+    The layout has an input section (prompts and sliders in two columns), an
+    output row of three images, and a set of example prompts. The Run button
+    is wired to `consistent_synthesis`.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("## **Input Settings**")
+        with gr.Row():
+            # Left column: prompts, sampling/MasaCtrl sliders and the Run button.
+            with gr.Column():
+                source_prompt = gr.Textbox(
+                    label="Source Prompt",
+                    value='1boy, casual, outdoors, sitting',
+                    interactive=True)
+                target_prompt = gr.Textbox(
+                    label="Target Prompt",
+                    value='1boy, casual, outdoors, standing',
+                    interactive=True)
+                with gr.Row():
+                    ddim_steps = gr.Slider(label="DDIM Steps",
+                                            minimum=1,
+                                            maximum=999,
+                                            value=50,
+                                            step=1)
+                    starting_step = gr.Slider(
+                        label="Step of MasaCtrl",
+                        minimum=0,
+                        maximum=999,
+                        value=4,
+                        step=1)
+                    starting_layer = gr.Slider(label="Layer of MasaCtrl",
+                                                minimum=0,
+                                                maximum=16,
+                                                value=10,
+                                                step=1)
+                run_btn = gr.Button(label="Run")
+            # Right column: extra prompt text and generation settings.
+            with gr.Column():
+                appended_prompt = gr.Textbox(label="Appended Prompt", value='')
+                negative_prompt = gr.Textbox(label="Negative Prompt", value='')
+                with gr.Row():
+                    image_resolution = gr.Slider(label="Image Resolution",
+                                                minimum=256,
+                                                maximum=768,
+                                                value=512,
+                                                step=64)
+                    scale = gr.Slider(label="CFG Scale",
+                                    minimum=0.1,
+                                    maximum=30.0,
+                                    value=7.5,
+                                    step=0.1)
+                    seed = gr.Slider(label="Seed",
+                                    minimum=-1,
+                                    maximum=2147483647,
+                                    value=42,
+                                    step=1)
+        gr.Markdown("## **Output**")
+        with gr.Row():
+            image_source = gr.Image(label="Source Image")
+            image_fixed = gr.Image(label="Image with Fixed Seed")
+            image_masactrl = gr.Image(label="Image with MasaCtrl")
+        # The order of this list must match the positional parameters of
+        # consistent_synthesis.
+        inputs = [
+            source_prompt, target_prompt, starting_step, starting_layer,
+            image_resolution, ddim_steps, scale, seed, appended_prompt,
+            negative_prompt
+        ]
+        run_btn.click(consistent_synthesis, inputs,
+                        [image_source, image_fixed, image_masactrl])
+        # Each example row fills (source prompt, target prompt, seed).
+        gr.Examples(
+            [[
+                "1boy, bishounen, casual, indoors, sitting, coffee shop, bokeh",
+                "1boy, bishounen, casual, indoors, standing, coffee shop, bokeh",
+                42
+            ],
+                [
+                    "1boy, casual, outdoors, sitting",
+                    "1boy, casual, outdoors, sitting, side view", 42
+                ],
+                [
+                    "1boy, casual, outdoors, sitting",
+                    "1boy, casual, outdoors, standing, clapping hands", 42
+                ],
+                [
+                    "1boy, casual, outdoors, sitting",
+                    "1boy, casual, outdoors, sitting, shows thumbs up", 42
+                ],
+                [
+                    "1boy, casual, outdoors, sitting",
+                    "1boy, casual, outdoors, sitting, with crossed arms", 42
+                ],
+                [
+                    "1boy, casual, outdoors, sitting",
+                    "1boy, casual, outdoors, sitting, rasing hands", 42
+                ]],
+            [source_prompt, target_prompt, seed],
+        )
+    return demo
+if __name__ == "__main__":
+    # Stand-alone entry point: build and serve only the synthesis UI.
+    demo_synthesis = create_demo_synthesis()
+    demo_synthesis.launch()

gradio_app/images/corgi.jpg ADDED Viewed

gradio_app/images/person.png ADDED Viewed

gradio_app/real_image_editing_app.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import os
+import numpy as np
+import gradio as gr
+import torch
+import torch.nn.functional as F
+from diffusers import DDIMScheduler
+from torchvision.io import read_image
+from pytorch_lightning import seed_everything
+from masactrl.diffuser_utils import MasaCtrlPipeline
+from masactrl.masactrl_utils import (AttentionBase,
+                                     regiter_attention_editor_diffusers)
+from .app_utils import global_context
+torch.set_grad_enabled(False)
+# device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
+#     "cpu")
+# model_path = "CompVis/stable-diffusion-v1-4"
+# scheduler = DDIMScheduler(beta_start=0.00085,
+#                           beta_end=0.012,
+#                           beta_schedule="scaled_linear",
+#                           clip_sample=False,
+#                           set_alpha_to_one=False)
+# model = MasaCtrlPipeline.from_pretrained(model_path,
+#                                          scheduler=scheduler).to(device)
+def load_image(image_path):
+    """Read an image file into a normalized 512x512 batch tensor.
+
+    Args:
+        image_path: path passed to `torchvision.io.read_image`.
+
+    Returns:
+        A float tensor of shape (1, C, 512, 512) with values in [-1, 1],
+        where C is at most 3, placed on CUDA when available and on the CPU
+        otherwise.
+    """
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    image = read_image(image_path)
+    # Keep at most the first three channels and add a batch dimension.
+    image = image[:3].unsqueeze_(0).float() / 127.5 - 1.  # [-1, 1]
+    image = F.interpolate(image, (512, 512))
+    image = image.to(device)
+    # The tensor was previously built and then discarded (implicit None).
+    return image
+def real_image_editing(source_image, target_prompt,
+                       starting_step, starting_layer, ddim_steps, scale, seed,
+                       appended_prompt, negative_prompt):
+    """Invert a real image with DDIM and re-synthesize it under a new prompt.
+
+    Args:
+        source_image: the image to edit; a numpy array is scaled to [-1, 1],
+            moved to channel-first layout and resized to 512x512 before
+            inversion.
+        target_prompt: prompt describing the edited image.
+        starting_step: first sampling step at which MasaCtrl is applied.
+        starting_layer: first attention layer at which MasaCtrl is applied.
+        ddim_steps: number of DDIM steps, used for inversion and sampling.
+        scale: classifier-free guidance scale.
+        seed: value handed to `seed_everything`.
+        appended_prompt: text concatenated verbatim to the target prompt
+            (skipped when None).
+        negative_prompt: accepted for UI compatibility but not applied.
+            NOTE(review): `invert` always uses the empty prompt as its
+            unconditional text, so a different one is not injected here;
+            confirm whether that is the intended behaviour.
+
+    Returns:
+        A list of three HWC numpy images: the reconstruction of the source,
+        the target sampled from the inverted noise without MasaCtrl, and the
+        target sampled with MasaCtrl.
+
+    Raises:
+        gr.Error: if no source image was provided.
+    """
+    from masactrl.masactrl import MutualSelfAttentionControl
+    if source_image is None:
+        raise gr.Error("Please provide a source image.")
+    model = global_context["model"]
+    device = global_context["device"]
+    # Slider values may arrive as floats; step and layer indices must be ints.
+    ddim_steps = int(ddim_steps)
+    starting_step = int(starting_step)
+    starting_layer = int(starting_layer)
+    seed_everything(seed)
+    with torch.no_grad():
+        if appended_prompt is not None:
+            target_prompt += appended_prompt
+        ref_prompt = ""
+        prompts = [ref_prompt, target_prompt]
+        # invert the image into noise map
+        if isinstance(source_image, np.ndarray):
+            source_image = torch.from_numpy(source_image).to(device) / 127.5 - 1.
+            source_image = source_image.unsqueeze(0).permute(0, 3, 1, 2)
+            source_image = F.interpolate(source_image, (512, 512))
+        start_code, _ = model.invert(source_image,
+                                     ref_prompt,
+                                     guidance_scale=scale,
+                                     num_inference_steps=ddim_steps,
+                                     return_intermediates=True)
+        start_code = start_code.expand(len(prompts), -1, -1, -1)
+        # reconstruct the image with inverted DDIM noise map
+        editor = AttentionBase()
+        regiter_attention_editor_diffusers(model, editor)
+        image_fixed = model([target_prompt],
+                            latents=start_code[-1:],
+                            num_inference_steps=ddim_steps,
+                            guidance_scale=scale)
+        image_fixed = image_fixed.cpu().permute(0, 2, 3, 1).numpy()
+        # inference the synthesized image with MasaCtrl
+        # hijack the attention module; total_steps must match the sampler so
+        # the controller's step range covers the whole run
+        controller = MutualSelfAttentionControl(starting_step, starting_layer,
+                                                total_steps=ddim_steps)
+        regiter_attention_editor_diffusers(model, controller)
+        # inference the synthesized image; use the same number of steps as
+        # the inversion rather than the pipeline default of 50
+        image_masactrl = model(prompts,
+                               latents=start_code,
+                               num_inference_steps=ddim_steps,
+                               guidance_scale=scale)
+        image_masactrl = image_masactrl.cpu().permute(0, 2, 3, 1).numpy()
+    return [
+        image_masactrl[0],
+        image_fixed[0],
+        image_masactrl[1]
+    ]  # source, fixed seed, masactrl
+def create_demo_editing():
+    """Build the "real image editing" Gradio UI and return its Blocks object.
+
+    The layout has an input section (source image, target prompt and sliders
+    in two columns), an output row of three images, and two bundled examples.
+    The Run button is wired to `real_image_editing`.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("## **Input Settings**")
+        with gr.Row():
+            # Left column: source image, target prompt, sliders and Run button.
+            with gr.Column():
+                source_image = gr.Image(label="Source Image", value=os.path.join(os.path.dirname(__file__), "images/corgi.jpg"), interactive=True)
+                target_prompt = gr.Textbox(label="Target Prompt",
+                                        value='A photo of a running corgi',
+                                        interactive=True)
+                with gr.Row():
+                    ddim_steps = gr.Slider(label="DDIM Steps",
+                                        minimum=1,
+                                        maximum=999,
+                                        value=50,
+                                        step=1)
+                    starting_step = gr.Slider(label="Step of MasaCtrl",
+                                            minimum=0,
+                                            maximum=999,
+                                            value=4,
+                                            step=1)
+                    starting_layer = gr.Slider(label="Layer of MasaCtrl",
+                                            minimum=0,
+                                            maximum=16,
+                                            value=10,
+                                            step=1)
+                run_btn = gr.Button(label="Run")
+            # Right column: extra prompt text and generation settings.
+            with gr.Column():
+                appended_prompt = gr.Textbox(label="Appended Prompt", value='')
+                negative_prompt = gr.Textbox(label="Negative Prompt", value='')
+                with gr.Row():
+                    scale = gr.Slider(label="CFG Scale",
+                                    minimum=0.1,
+                                    maximum=30.0,
+                                    value=7.5,
+                                    step=0.1)
+                    seed = gr.Slider(label="Seed",
+                                    minimum=-1,
+                                    maximum=2147483647,
+                                    value=42,
+                                    step=1)
+        gr.Markdown("## **Output**")
+        with gr.Row():
+            image_recons = gr.Image(label="Source Image")
+            image_fixed = gr.Image(label="Image with Fixed Seed")
+            image_masactrl = gr.Image(label="Image with MasaCtrl")
+        # The order of this list must match the positional parameters of
+        # real_image_editing.
+        inputs = [
+            source_image, target_prompt, starting_step, starting_layer, ddim_steps,
+            scale, seed, appended_prompt, negative_prompt
+        ]
+        run_btn.click(real_image_editing, inputs,
+                    [image_recons, image_fixed, image_masactrl])
+        # Each example row fills (source image, target prompt); the image
+        # paths are resolved relative to this module's directory.
+        gr.Examples(
+            [[os.path.join(os.path.dirname(__file__), "images/corgi.jpg"),
+              "A photo of a running corgi"],
+            [os.path.join(os.path.dirname(__file__), "images/person.png"),
+             "A photo of a person, black t-shirt, raising hand"],
+            ],
+            [source_image, target_prompt]
+        )
+    return demo
+if __name__ == "__main__":
+    # Stand-alone entry point: build and serve only the editing UI.
+    demo_editing = create_demo_editing()
+    demo_editing.launch()

masactrl/__init__.py ADDED Viewed

File without changes

masactrl/diffuser_utils.py ADDED Viewed

	@@ -0,0 +1,275 @@

+"""
+Util functions based on Diffuser framework.
+"""
+import os
+import torch
+import cv2
+import numpy as np
+import torch.nn.functional as F
+from tqdm import tqdm
+from PIL import Image
+from torchvision.utils import save_image
+from torchvision.io import read_image
+from diffusers import StableDiffusionPipeline
+from pytorch_lightning import seed_everything
+class MasaCtrlPipeline(StableDiffusionPipeline):
+    def next_step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        x: torch.FloatTensor,
+        eta=0.,
+        verbose=False
+    ):
+        """
+        Single DDIM inversion update: move `x` one stride towards more noise.
+
+        `timestep` is the level being moved to; the level `x` currently sits
+        at is one stride (num_train_timesteps // num_inference_steps) below
+        it, capped at 999. When that source level is negative, the
+        scheduler's `final_alpha_cumprod` is used in place of a table lookup.
+        `eta` is accepted but not used.
+
+        Returns:
+            (x_next, pred_x0): the latent at `timestep` and the clean-sample
+            estimate computed from `x` and `model_output`.
+        """
+        if verbose:
+            print("timestep: ", timestep)
+        scheduler = self.scheduler
+        stride = scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+        target_t = timestep
+        source_t = min(timestep - stride, 999)
+        if source_t >= 0:
+            alpha_source = scheduler.alphas_cumprod[source_t]
+        else:
+            alpha_source = scheduler.final_alpha_cumprod
+        alpha_target = scheduler.alphas_cumprod[target_t]
+        # Estimate the clean sample at the source level ...
+        x0_estimate = (x - (1 - alpha_source)**0.5 * model_output) / alpha_source**0.5
+        # ... then re-noise it to the target level along the predicted noise.
+        noise_direction = (1 - alpha_target)**0.5 * model_output
+        return alpha_target**0.5 * x0_estimate + noise_direction, x0_estimate
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        x: torch.FloatTensor,
+        eta: float=0.0,
+        verbose=False,
+    ):
+        """
+        Predict the sample at the next step of the denoising process.
+
+        The previous timestep is `timestep` minus one stride
+        (num_train_timesteps // num_inference_steps). When it is not strictly
+        positive, the scheduler's `final_alpha_cumprod` is used in place of a
+        table lookup. `eta` and `verbose` are accepted but not used.
+
+        Returns:
+            (x_prev, pred_x0): the latent at the previous timestep and the
+            clean-sample estimate computed from `x` and `model_output`.
+        """
+        scheduler = self.scheduler
+        stride = scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+        earlier_t = timestep - stride
+        alpha_now = scheduler.alphas_cumprod[timestep]
+        if earlier_t > 0:
+            alpha_earlier = scheduler.alphas_cumprod[earlier_t]
+        else:
+            alpha_earlier = scheduler.final_alpha_cumprod
+        # Estimate the clean sample from the current latent ...
+        x0_estimate = (x - (1 - alpha_now)**0.5 * model_output) / alpha_now**0.5
+        # ... and re-noise it to the earlier level along the predicted noise.
+        noise_direction = (1 - alpha_earlier)**0.5 * model_output
+        return alpha_earlier**0.5 * x0_estimate + noise_direction, x0_estimate
+    @torch.no_grad()
+    def image2latent(self, image):
+        """
+        Encode an image into scaled VAE latents.
+
+        Args:
+            image: either a tensor that is passed to the VAE encoder as-is
+                (the caller is responsible for its layout, value range and
+                device), or any non-tensor image that `np.array` can convert
+                to an HWC array with values in [0, 255], such as a PIL image.
+
+        Returns:
+            The mean of the encoder's latent distribution multiplied by
+            0.18215.
+        """
+        DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        # The previous check `type(image) is Image` compared against the PIL
+        # *module* and could never be true, so non-tensor inputs reached the
+        # encoder unconverted. Anything that is not already a tensor is now
+        # converted to a (1, C, H, W) tensor in [-1, 1].
+        if not isinstance(image, torch.Tensor):
+            image = np.array(image)
+            image = torch.from_numpy(image).float() / 127.5 - 1
+            image = image.permute(2, 0, 1).unsqueeze(0).to(DEVICE)
+        # input image density range [-1, 1]
+        latents = self.vae.encode(image)['latent_dist'].mean
+        latents = latents * 0.18215
+        return latents
+    @torch.no_grad()
+    def latent2image(self, latents, return_type='np'):
+        """
+        Decode latents with the VAE.
+
+        Args:
+            latents: latents to decode; they are detached and multiplied by
+                1 / 0.18215 before decoding.
+            return_type: 'np' returns the first image of the batch as an HWC
+                uint8 array; 'pt' returns the whole batch as a tensor clamped
+                to [0, 1]; any other value returns the raw decoder output.
+        """
+        decoded = self.vae.decode(1 / 0.18215 * latents.detach())['sample']
+        if return_type not in ('np', 'pt'):
+            return decoded
+        # Map the decoder output from [-1, 1] to [0, 1].
+        unit_range = (decoded / 2 + 0.5).clamp(0, 1)
+        if return_type == "pt":
+            return unit_range
+        first_image = unit_range.cpu().permute(0, 2, 3, 1).numpy()[0]
+        return (first_image * 255).astype(np.uint8)
+    def latent2image_grad(self, latents):
+        """
+        Decode latents with the VAE without detaching them and without a
+        no_grad wrapper, returning the raw decoder output.
+        """
+        rescaled = 1 / 0.18215 * latents
+        return self.vae.decode(rescaled)['sample']  # range [-1, 1]
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt,
+        batch_size=1,
+        height=512,
+        width=512,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        eta=0.0,
+        latents=None,
+        unconditioning=None,
+        neg_prompt=None,
+        ref_intermediate_latents=None,
+        return_intermediates=False,
+        **kwds):
+        """
+        Sample images from text prompts with the pipeline's own `step` update.
+
+        Args:
+            prompt: a string or a list of strings; a list sets the batch size.
+            batch_size: number of copies to generate when `prompt` is a string.
+            height, width: output size in pixels; latents are 1/8 of this.
+            num_inference_steps: number of scheduler timesteps to run.
+            guidance_scale: classifier-free guidance weight; guidance is only
+                applied when it is greater than 1.
+            eta: accepted but not used in this method.
+            latents: optional starting latents; must have shape
+                (batch, unet.in_channels, height // 8, width // 8).
+            unconditioning: optional list indexed by step whose entries
+                replace the unconditional half of the text embeddings.
+            neg_prompt: text used for the unconditional embedding instead of
+                the empty string.
+            ref_intermediate_latents: optional sequence indexed from the end
+                by step; its entry replaces the first half of the latents
+                before each step.
+            return_intermediates: also return the decoded per-step lists.
+            **kwds: if "dir" is set, its value scales a direction added to
+                the last prompt's embedding (see the PCA block below).
+
+        Returns:
+            The decoded image batch as a tensor clamped to [0, 1]; with
+            `return_intermediates`, a tuple (image, pred_x0_list,
+            latents_list) where both lists hold decoded tensors.
+        """
+        DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        if isinstance(prompt, list):
+            batch_size = len(prompt)
+        elif isinstance(prompt, str):
+            if batch_size > 1:
+                prompt = [prompt] * batch_size
+        # text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            return_tensors="pt"
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
+        print("input text embeddings :", text_embeddings.shape)
+        if kwds.get("dir"):
+            # Shift the last prompt's embedding along the rank-1 PCA direction
+            # of the difference between the last two prompt embeddings.
+            dir = text_embeddings[-2] - text_embeddings[-1]
+            u, s, v = torch.pca_lowrank(dir.transpose(-1, -2), q=1, center=True)
+            text_embeddings[-1] = text_embeddings[-1] + kwds.get("dir") * v
+            print(u.shape)
+            print(v.shape)
+        # define initial latents
+        latents_shape = (batch_size, self.unet.in_channels, height//8, width//8)
+        if latents is None:
+            latents = torch.randn(latents_shape, device=DEVICE)
+        else:
+            assert latents.shape == latents_shape, f"The shape of input latent tensor {latents.shape} should equal to predefined one."
+        # unconditional embedding for classifier free guidance
+        if guidance_scale > 1.:
+            # NOTE(review): max_length is assigned but not used below.
+            max_length = text_input.input_ids.shape[-1]
+            if neg_prompt:
+                uc_text = neg_prompt
+            else:
+                uc_text = ""
+            # uc_text = "ugly, tiling, poorly drawn hands, poorly drawn feet, body out of frame, cut off, low contrast, underexposed, distorted face"
+            unconditional_input = self.tokenizer(
+                [uc_text] * batch_size,
+                padding="max_length",
+                max_length=77,
+                return_tensors="pt"
+            )
+            # unconditional_input.input_ids = unconditional_input.input_ids[:, 1:]
+            unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
+            # Layout is [unconditional, conditional] along the batch axis.
+            text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
+        print("latents shape: ", latents.shape)
+        # iterative sampling
+        self.scheduler.set_timesteps(num_inference_steps)
+        # print("Valid timesteps: ", reversed(self.scheduler.timesteps))
+        latents_list = [latents]
+        pred_x0_list = [latents]
+        for i, t in enumerate(tqdm(self.scheduler.timesteps, desc="DDIM Sampler")):
+            if ref_intermediate_latents is not None:
+                # note that the batch_size >= 2
+                latents_ref = ref_intermediate_latents[-1 - i]
+                _, latents_cur = latents.chunk(2)
+                latents = torch.cat([latents_ref, latents_cur])
+            if guidance_scale > 1.:
+                # Duplicate the latents to match the doubled text embeddings.
+                model_inputs = torch.cat([latents] * 2)
+            else:
+                model_inputs = latents
+            if unconditioning is not None and isinstance(unconditioning, list):
+                _, text_embeddings = text_embeddings.chunk(2)
+                text_embeddings = torch.cat([unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
+            # predict the noise
+            noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
+            if guidance_scale > 1.:
+                noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
+                noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
+            # compute the previous noise sample x_t -> x_t-1
+            latents, pred_x0 = self.step(noise_pred, t, latents)
+            latents_list.append(latents)
+            pred_x0_list.append(pred_x0)
+        image = self.latent2image(latents, return_type="pt")
+        if return_intermediates:
+            # Decode every stored latent so the caller receives image tensors.
+            pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
+            latents_list = [self.latent2image(img, return_type="pt") for img in latents_list]
+            return image, pred_x0_list, latents_list
+        return image
+    @torch.no_grad()
+    def invert(
+        self,
+        image: torch.Tensor,
+        prompt,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        eta=0.0,
+        return_intermediates=False,
+        **kwds):
+        """
+        Invert a real image into a noise map with deterministic DDIM inversion.
+
+        Args:
+            image: image batch handed to `image2latent`.
+            prompt: a string or a list of strings describing the image. A
+                single image is expanded to match a list of prompts; a single
+                string is repeated to match a batch of images.
+            num_inference_steps: number of scheduler timesteps to run.
+            guidance_scale: classifier-free guidance weight; guidance is only
+                applied when it is greater than 1.
+            eta: accepted but not used in this method.
+            return_intermediates: return the per-step latents instead of the
+                initial latents as the second value.
+
+        Returns:
+            (latents, latents_list) when `return_intermediates` is set,
+            otherwise (latents, start_latents), where `latents` is the final
+            inverted noise map.
+        """
+        DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        batch_size = image.shape[0]
+        if isinstance(prompt, list):
+            if batch_size == 1:
+                image = image.expand(len(prompt), -1, -1, -1)
+        elif isinstance(prompt, str):
+            if batch_size > 1:
+                prompt = [prompt] * batch_size
+        # Re-read the batch size: the image may just have been expanded to
+        # one copy per prompt, and the unconditional embeddings built below
+        # must have the same batch size as the conditional ones.
+        batch_size = image.shape[0]
+        # text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            return_tensors="pt"
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
+        print("input text embeddings :", text_embeddings.shape)
+        # define initial latents
+        latents = self.image2latent(image)
+        start_latents = latents
+        # unconditional embedding for classifier free guidance
+        if guidance_scale > 1.:
+            unconditional_input = self.tokenizer(
+                [""] * batch_size,
+                padding="max_length",
+                max_length=77,
+                return_tensors="pt"
+            )
+            unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
+            # Layout is [unconditional, conditional] along the batch axis.
+            text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
+        print("latents shape: ", latents.shape)
+        # iterative sampling
+        self.scheduler.set_timesteps(num_inference_steps)
+        print("Valid timesteps: ", reversed(self.scheduler.timesteps))
+        latents_list = [latents]
+        pred_x0_list = [latents]
+        # Walk the scheduler's timesteps in reverse order.
+        for i, t in enumerate(tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
+            if guidance_scale > 1.:
+                # Duplicate the latents to match the doubled text embeddings.
+                model_inputs = torch.cat([latents] * 2)
+            else:
+                model_inputs = latents
+            # predict the noise
+            noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
+            if guidance_scale > 1.:
+                noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
+                noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
+            # compute the next noise sample x_t-1 -> x_t
+            latents, pred_x0 = self.next_step(noise_pred, t, latents)
+            latents_list.append(latents)
+            pred_x0_list.append(pred_x0)
+        if return_intermediates:
+            # return the intermediate latents during inversion
+            return latents, latents_list
+        return latents, start_latents

masactrl/masactrl.py ADDED Viewed

	@@ -0,0 +1,280 @@

+import os
+import torch
+import torch.nn.functional as F
+import numpy as np
+from einops import rearrange
+from .masactrl_utils import AttentionBase
+from torchvision.utils import save_image
+class MutualSelfAttentionControl(AttentionBase):
+    """Attention editor that applies mutual self-attention on selected steps and layers."""
+    def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50):
+        """
+        Mutual self-attention control for Stable-Diffusion model
+        Args:
+            start_step: the step to start mutual self-attention control
+            start_layer: the layer to start mutual self-attention control
+            layer_idx: list of the layers to apply mutual self-attention control;
+                defaults to every layer from start_layer up to (excluding) 16
+            step_idx: list the steps to apply mutual self-attention control;
+                defaults to every step from start_step up to (excluding) total_steps
+            total_steps: the total number of steps
+        """
+        super().__init__()
+        self.total_steps = total_steps
+        self.start_step = start_step
+        self.start_layer = start_layer
+        if layer_idx is None:
+            layer_idx = list(range(start_layer, 16))
+        if step_idx is None:
+            step_idx = list(range(start_step, total_steps))
+        self.layer_idx = layer_idx
+        self.step_idx = step_idx
+        print("step_idx: ", self.step_idx)
+        print("layer_idx: ", self.layer_idx)
+    def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        """
+        Recompute attention with the batch folded into the token axis.
+
+        q, k and v arrive with batch and heads merged on the first axis; they
+        are regrouped per head so every query token is scored against all key
+        tokens passed in. The incoming `sim` and `attn` are ignored and
+        recomputed using `kwargs["scale"]`.
+        """
+        batch = q.shape[0] // num_heads
+        per_head = "(b h) n d -> h (b n) d"
+        q_heads, k_heads, v_heads = (
+            rearrange(tensor, per_head, h=num_heads) for tensor in (q, k, v)
+        )
+        scores = torch.einsum("h i d, h j d -> h i j", q_heads, k_heads) * kwargs.get("scale")
+        weights = scores.softmax(-1)
+        mixed = torch.einsum("h i j, h j d -> h i d", weights, v_heads)
+        return rearrange(mixed, "h (b n) d -> b n (h d)", b=batch)
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        """
+        Attention forward function
+
+        Falls back to the base class for cross-attention and for steps or
+        layers outside the configured ranges. Otherwise each half of the
+        batch attends only to the keys and values of the first `num_heads`
+        rows of that half.
+        """
+        controlled = (
+            not is_cross
+            and self.cur_step in self.step_idx
+            and self.cur_att_layer // 2 in self.layer_idx
+        )
+        if not controlled:
+            return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+        # Split every tensor into its two batch halves (presumably the
+        # unconditional and conditional branches of classifier-free guidance).
+        q_first, q_second = q.chunk(2)
+        k_first, k_second = k.chunk(2)
+        v_first, v_second = v.chunk(2)
+        attn_first, attn_second = attn.chunk(2)
+        shared_sim = sim[:num_heads]
+        out_first = self.attn_batch(
+            q_first, k_first[:num_heads], v_first[:num_heads], shared_sim,
+            attn_first, is_cross, place_in_unet, num_heads, **kwargs)
+        out_second = self.attn_batch(
+            q_second, k_second[:num_heads], v_second[:num_heads], shared_sim,
+            attn_second, is_cross, place_in_unet, num_heads, **kwargs)
+        return torch.cat([out_first, out_second], dim=0)
+class MutualSelfAttentionControlMask(MutualSelfAttentionControl):
+    def __init__(self,  start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, mask_s=None, mask_t=None, mask_save_dir=None):
+        """
+        Maske-guided MasaCtrl to alleviate the problem of fore- and background confusion
+        Args:
+            start_step: the step to start mutual self-attention control
+            start_layer: the layer to start mutual self-attention control
+            layer_idx: list of the layers to apply mutual self-attention control
+            step_idx: list the steps to apply mutual self-attention control
+            total_steps: the total number of steps
+            mask_s: source mask with shape (h, w)
+            mask_t: target mask with same shape as source mask
+        """
+        super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps)
+        self.mask_s = mask_s  # source mask with shape (h, w)
+        self.mask_t = mask_t  # target mask with same shape as source mask
+        print("Using mask-guided MasaCtrl")
+        if mask_save_dir is not None:
+            os.makedirs(mask_save_dir, exist_ok=True)
+            save_image(self.mask_s.unsqueeze(0).unsqueeze(0), os.path.join(mask_save_dir, "mask_s.png"))
+            save_image(self.mask_t.unsqueeze(0).unsqueeze(0), os.path.join(mask_save_dir, "mask_t.png"))
+    def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        B = q.shape[0] // num_heads
+        H = W = int(np.sqrt(q.shape[1]))
+        q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
+        k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
+        v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
+        sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
+        if kwargs.get("is_mask_attn") and self.mask_s is not None:
+            print("masked attention")
+            mask = self.mask_s.unsqueeze(0).unsqueeze(0)
+            mask = F.interpolate(mask, (H, W)).flatten(0).unsqueeze(0)
+            mask = mask.flatten()
+            # background
+            sim_bg = sim + mask.masked_fill(mask == 1, torch.finfo(sim.dtype).min)
+            # object
+            sim_fg = sim + mask.masked_fill(mask == 0, torch.finfo(sim.dtype).min)
+            sim = torch.cat([sim_fg, sim_bg], dim=0)
+        attn = sim.softmax(-1)
+        if len(attn) == 2 * len(v):
+            v = torch.cat([v] * 2)
+        out = torch.einsum("h i j, h j d -> h i d", attn, v)
+        out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
+        return out
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        """
+        Attention forward function
+        """
+        if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
+            return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+        B = q.shape[0] // num_heads // 2
+        H = W = int(np.sqrt(q.shape[1]))
+        qu, qc = q.chunk(2)
+        ku, kc = k.chunk(2)
+        vu, vc = v.chunk(2)
+        attnu, attnc = attn.chunk(2)
+        out_u_source = self.attn_batch(qu[:num_heads], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
+        out_c_source = self.attn_batch(qc[:num_heads], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
+        out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, is_mask_attn=True, **kwargs)
+        out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, is_mask_attn=True, **kwargs)
+        if self.mask_s is not None and self.mask_t is not None:
+            out_u_target_fg, out_u_target_bg = out_u_target.chunk(2, 0)
+            out_c_target_fg, out_c_target_bg = out_c_target.chunk(2, 0)
+            mask = F.interpolate(self.mask_t.unsqueeze(0).unsqueeze(0), (H, W))
+            mask = mask.reshape(-1, 1)  # (hw, 1)
+            out_u_target = out_u_target_fg * mask + out_u_target_bg * (1 - mask)
+            out_c_target = out_c_target_fg * mask + out_c_target_bg * (1 - mask)
+        out = torch.cat([out_u_source, out_u_target, out_c_source, out_c_target], dim=0)
+        return out
+class MutualSelfAttentionControlMaskAuto(MutualSelfAttentionControl):
+    def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, thres=0.1, ref_token_idx=[1], cur_token_idx=[1], mask_save_dir=None):
+        """
+        MasaCtrl with mask auto generation from cross-attention map
+        Args:
+            start_step: the step to start mutual self-attention control
+            start_layer: the layer to start mutual self-attention control
+            layer_idx: list of the layers to apply mutual self-attention control
+            step_idx: list the steps to apply mutual self-attention control
+            total_steps: the total number of steps
+            thres: the thereshold for mask thresholding
+            ref_token_idx: the token index list for cross-attention map aggregation
+            cur_token_idx: the token index list for cross-attention map aggregation
+            mask_save_dir: the path to save the mask image
+        """
+        super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps)
+        print("using MutualSelfAttentionControlMaskAuto")
+        self.thres = thres
+        self.ref_token_idx = ref_token_idx
+        self.cur_token_idx = cur_token_idx
+        self.self_attns = []
+        self.cross_attns = []
+        self.cross_attns_mask = None
+        self.self_attns_mask = None
+        self.mask_save_dir = mask_save_dir
+        if self.mask_save_dir is not None:
+            os.makedirs(self.mask_save_dir, exist_ok=True)
+    def after_step(self):
+        self.self_attns = []
+        self.cross_attns = []
+    def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        B = q.shape[0] // num_heads
+        H = W = int(np.sqrt(q.shape[1]))
+        q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
+        k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
+        v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
+        sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
+        if self.self_attns_mask is not None:
+            # binarize the mask
+            mask = self.self_attns_mask
+            thres = self.thres
+            mask[mask >= thres] = 1
+            mask[mask < thres] = 0
+            sim_fg = sim + mask.masked_fill(mask == 0, torch.finfo(sim.dtype).min)
+            sim_bg = sim + mask.masked_fill(mask == 1, torch.finfo(sim.dtype).min)
+            sim = torch.cat([sim_fg, sim_bg])
+        attn = sim.softmax(-1)
+        if len(attn) == 2 * len(v):
+            v = torch.cat([v] * 2)
+        out = torch.einsum("h i j, h j d -> h i d", attn, v)
+        out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
+        return out
+    def aggregate_cross_attn_map(self, idx):
+        attn_map = torch.stack(self.cross_attns, dim=1).mean(1)  # (B, N, dim)
+        B = attn_map.shape[0]
+        res = int(np.sqrt(attn_map.shape[-2]))
+        attn_map = attn_map.reshape(-1, res, res, attn_map.shape[-1])
+        image = attn_map[..., idx]
+        if isinstance(idx, list):
+            image = image.sum(-1)
+        image_min = image.min(dim=1, keepdim=True)[0].min(dim=2, keepdim=True)[0]
+        image_max = image.max(dim=1, keepdim=True)[0].max(dim=2, keepdim=True)[0]
+        image = (image - image_min) / (image_max - image_min)
+        return image
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        """
+        Attention forward function
+        """
+        if is_cross:
+            # save cross attention map with res 16 * 16
+            if attn.shape[1] == 16 * 16:
+                self.cross_attns.append(attn.reshape(-1, num_heads, *attn.shape[-2:]).mean(1))
+        if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
+            return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+        B = q.shape[0] // num_heads // 2
+        H = W = int(np.sqrt(q.shape[1]))
+        qu, qc = q.chunk(2)
+        ku, kc = k.chunk(2)
+        vu, vc = v.chunk(2)
+        attnu, attnc = attn.chunk(2)
+        out_u_source = self.attn_batch(qu[:num_heads], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
+        out_c_source = self.attn_batch(qc[:num_heads], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
+        if len(self.cross_attns) == 0:
+            self.self_attns_mask = None
+            out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
+            out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
+        else:
+            mask = self.aggregate_cross_attn_map(idx=self.ref_token_idx)  # (2, H, W)
+            mask_source = mask[-2]  # (H, W)
+            res = int(np.sqrt(q.shape[1]))
+            self.self_attns_mask = F.interpolate(mask_source.unsqueeze(0).unsqueeze(0), (res, res)).flatten()
+            if self.mask_save_dir is not None:
+                H = W = int(np.sqrt(self.self_attns_mask.shape[0]))
+                mask_image = self.self_attns_mask.reshape(H, W).unsqueeze(0)
+                save_image(mask_image, os.path.join(self.mask_save_dir, f"mask_s_{self.cur_step}_{self.cur_att_layer}.png"))
+            out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
+            out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
+        if self.self_attns_mask is not None:
+            mask = self.aggregate_cross_attn_map(idx=self.cur_token_idx)  # (2, H, W)
+            mask_target = mask[-1]  # (H, W)
+            res = int(np.sqrt(q.shape[1]))
+            spatial_mask = F.interpolate(mask_target.unsqueeze(0).unsqueeze(0), (res, res)).reshape(-1, 1)
+            if self.mask_save_dir is not None:
+                H = W = int(np.sqrt(spatial_mask.shape[0]))
+                mask_image = spatial_mask.reshape(H, W).unsqueeze(0)
+                save_image(mask_image, os.path.join(self.mask_save_dir, f"mask_t_{self.cur_step}_{self.cur_att_layer}.png"))
+            # binarize the mask
+            thres = self.thres
+            spatial_mask[spatial_mask >= thres] = 1
+            spatial_mask[spatial_mask < thres] = 0
+            out_u_target_fg, out_u_target_bg = out_u_target.chunk(2)
+            out_c_target_fg, out_c_target_bg = out_c_target.chunk(2)
+            out_u_target = out_u_target_fg * spatial_mask + out_u_target_bg * (1 - spatial_mask)
+            out_c_target = out_c_target_fg * spatial_mask + out_c_target_bg * (1 - spatial_mask)
+            # set self self-attention mask to None
+            self.self_attns_mask = None
+        out = torch.cat([out_u_source, out_u_target, out_c_source, out_c_target], dim=0)
+        return out

masactrl/masactrl_utils.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import os
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Union, Tuple, List, Callable, Dict
+from torchvision.utils import save_image
+from einops import rearrange, repeat
+class AttentionBase:
+    def __init__(self):
+        self.cur_step = 0
+        self.num_att_layers = -1
+        self.cur_att_layer = 0
+    def after_step(self):
+        pass
+    def __call__(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        out = self.forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+        self.cur_att_layer += 1
+        if self.cur_att_layer == self.num_att_layers:
+            self.cur_att_layer = 0
+            self.cur_step += 1
+            # after step
+            self.after_step()
+        return out
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        out = torch.einsum('b i j, b j d -> b i d', attn, v)
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
+        return out
+    def reset(self):
+        self.cur_step = 0
+        self.cur_att_layer = 0
+class AttentionStore(AttentionBase):
+    def __init__(self, res=[32], min_step=0, max_step=1000):
+        super().__init__()
+        self.res = res
+        self.min_step = min_step
+        self.max_step = max_step
+        self.valid_steps = 0
+        self.self_attns = []  # store the all attns
+        self.cross_attns = []
+        self.self_attns_step = []  # store the attns in each step
+        self.cross_attns_step = []
+    def after_step(self):
+        if self.cur_step > self.min_step and self.cur_step < self.max_step:
+            self.valid_steps += 1
+            if len(self.self_attns) == 0:
+                self.self_attns = self.self_attns_step
+                self.cross_attns = self.cross_attns_step
+            else:
+                for i in range(len(self.self_attns)):
+                    self.self_attns[i] += self.self_attns_step[i]
+                    self.cross_attns[i] += self.cross_attns_step[i]
+        self.self_attns_step.clear()
+        self.cross_attns_step.clear()
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        if attn.shape[1] <= 64 ** 2:  # avoid OOM
+            if is_cross:
+                self.cross_attns_step.append(attn)
+            else:
+                self.self_attns_step.append(attn)
+        return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+def regiter_attention_editor_diffusers(model, editor: AttentionBase):
+    """
+    Register a attention editor to Diffuser Pipeline, refer from [Prompt-to-Prompt]
+    """
+    def ca_forward(self, place_in_unet):
+        def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
+            """
+            The attention is similar to the original implementation of LDM CrossAttention class
+            except adding some modifications on the attention
+            """
+            if encoder_hidden_states is not None:
+                context = encoder_hidden_states
+            if attention_mask is not None:
+                mask = attention_mask
+            to_out = self.to_out
+            if isinstance(to_out, nn.modules.container.ModuleList):
+                to_out = self.to_out[0]
+            else:
+                to_out = self.to_out
+            h = self.heads
+            q = self.to_q(x)
+            is_cross = context is not None
+            context = context if is_cross else x
+            k = self.to_k(context)
+            v = self.to_v(context)
+            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+            sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
+            if mask is not None:
+                mask = rearrange(mask, 'b ... -> b (...)')
+                max_neg_value = -torch.finfo(sim.dtype).max
+                mask = repeat(mask, 'b j -> (b h) () j', h=h)
+                mask = mask[:, None, :].repeat(h, 1, 1)
+                sim.masked_fill_(~mask, max_neg_value)
+            attn = sim.softmax(dim=-1)
+            # the only difference
+            out = editor(
+                q, k, v, sim, attn, is_cross, place_in_unet,
+                self.heads, scale=self.scale)
+            return to_out(out)
+        return forward
+    def register_editor(net, count, place_in_unet):
+        for name, subnet in net.named_children():
+            if net.__class__.__name__ == 'Attention':  # spatial Transformer layer
+                net.forward = ca_forward(net, place_in_unet)
+                return count + 1
+            elif hasattr(net, 'children'):
+                count = register_editor(subnet, count, place_in_unet)
+        return count
+    cross_att_count = 0
+    for net_name, net in model.unet.named_children():
+        if "down" in net_name:
+            cross_att_count += register_editor(net, 0, "down")
+        elif "mid" in net_name:
+            cross_att_count += register_editor(net, 0, "mid")
+        elif "up" in net_name:
+            cross_att_count += register_editor(net, 0, "up")
+    editor.num_att_layers = cross_att_count
+def regiter_attention_editor_ldm(model, editor: AttentionBase):
+    """
+    Register a attention editor to Stable Diffusion model, refer from [Prompt-to-Prompt]
+    """
+    def ca_forward(self, place_in_unet):
+        def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
+            """
+            The attention is similar to the original implementation of LDM CrossAttention class
+            except adding some modifications on the attention
+            """
+            if encoder_hidden_states is not None:
+                context = encoder_hidden_states
+            if attention_mask is not None:
+                mask = attention_mask
+            to_out = self.to_out
+            if isinstance(to_out, nn.modules.container.ModuleList):
+                to_out = self.to_out[0]
+            else:
+                to_out = self.to_out
+            h = self.heads
+            q = self.to_q(x)
+            is_cross = context is not None
+            context = context if is_cross else x
+            k = self.to_k(context)
+            v = self.to_v(context)
+            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+            sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
+            if mask is not None:
+                mask = rearrange(mask, 'b ... -> b (...)')
+                max_neg_value = -torch.finfo(sim.dtype).max
+                mask = repeat(mask, 'b j -> (b h) () j', h=h)
+                mask = mask[:, None, :].repeat(h, 1, 1)
+                sim.masked_fill_(~mask, max_neg_value)
+            attn = sim.softmax(dim=-1)
+            # the only difference
+            out = editor(
+                q, k, v, sim, attn, is_cross, place_in_unet,
+                self.heads, scale=self.scale)
+            return to_out(out)
+        return forward
+    def register_editor(net, count, place_in_unet):
+        for name, subnet in net.named_children():
+            if net.__class__.__name__ == 'CrossAttention':  # spatial Transformer layer
+                net.forward = ca_forward(net, place_in_unet)
+                return count + 1
+            elif hasattr(net, 'children'):
+                count = register_editor(subnet, count, place_in_unet)
+        return count
+    cross_att_count = 0
+    for net_name, net in model.model.diffusion_model.named_children():
+        if "input" in net_name:
+            cross_att_count += register_editor(net, 0, "input")
+        elif "middle" in net_name:
+            cross_att_count += register_editor(net, 0, "middle")
+        elif "output" in net_name:
+            cross_att_count += register_editor(net, 0, "output")
+    editor.num_att_layers = cross_att_count

playground.ipynb ADDED Viewed

	@@ -0,0 +1,149 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MasaCtrl: Tuning-free Mutual Self-Attention Control for Consistent Image Synthesis and Editing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "from einops import rearrange, repeat\n",
+    "from omegaconf import OmegaConf\n",
+    "\n",
+    "from diffusers import DDIMScheduler\n",
+    "\n",
+    "from masactrl.diffuser_utils import MasaCtrlPipeline\n",
+    "from masactrl.masactrl_utils import AttentionBase\n",
+    "from masactrl.masactrl_utils import regiter_attention_editor_diffusers\n",
+    "\n",
+    "from torchvision.utils import save_image\n",
+    "from torchvision.io import read_image\n",
+    "from pytorch_lightning import seed_everything\n",
+    "\n",
+    "torch.cuda.set_device(6)  # set the GPU device"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Model Construction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note that you may add your Hugging Face token to get access to the models\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "model_path = \"andite/anything-v4.0\"\n",
+    "# model_path = \"runwayml/stable-diffusion-v1-5\"\n",
+    "scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", clip_sample=False, set_alpha_to_one=False)\n",
+    "model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler, cross_attention_kwargs={\"scale\": 0.5}).to(device)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Consistent synthesis with MasaCtrl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from masactrl.masactrl import MutualSelfAttentionControl\n",
+    "\n",
+    "\n",
+    "seed = 42\n",
+    "seed_everything(seed)\n",
+    "\n",
+    "out_dir = \"./workdir/masactrl_exp/\"\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "sample_count = len(os.listdir(out_dir))\n",
+    "out_dir = os.path.join(out_dir, f\"sample_{sample_count}\")\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "prompts = [\n",
+    "    \"1boy, casual, outdoors, sitting\",  # source prompt\n",
+    "    \"1boy, casual, outdoors, standing\"  # target prompt\n",
+    "]\n",
+    "\n",
+    "# initialize the noise map\n",
+    "start_code = torch.randn([1, 4, 64, 64], device=device)\n",
+    "start_code = start_code.expand(len(prompts), -1, -1, -1)\n",
+    "\n",
+    "# inference the synthesized image without MasaCtrl\n",
+    "editor = AttentionBase()\n",
+    "regiter_attention_editor_diffusers(model, editor)\n",
+    "image_ori = model(prompts, latents=start_code, guidance_scale=7.5)\n",
+    "\n",
+    "# inference the synthesized image with MasaCtrl\n",
+    "STEP = 4\n",
+    "LAYPER = 10\n",
+    "\n",
+    "# hijack the attention module\n",
+    "editor = MutualSelfAttentionControl(STEP, LAYPER)\n",
+    "regiter_attention_editor_diffusers(model, editor)\n",
+    "\n",
+    "# inference the synthesized image\n",
+    "image_masactrl = model(prompts, latents=start_code, guidance_scale=7.5)[-1:]\n",
+    "\n",
+    "# save the synthesized image\n",
+    "out_image = torch.cat([image_ori, image_masactrl], dim=0)\n",
+    "save_image(out_image, os.path.join(out_dir, f\"all_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[0], os.path.join(out_dir, f\"source_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[1], os.path.join(out_dir, f\"without_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[2], os.path.join(out_dir, f\"masactrl_step{STEP}_layer{LAYPER}.png\"))\n",
+    "\n",
+    "print(\"Syntheiszed images are saved in\", out_dir)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.5 ('ldm')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "587aa04bacead72c1ffd459abbe4c8140b72ba2b534b24165b36a2ede3d95042"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

playground_real.ipynb ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MasaCtrl: Tuning-free Mutual Self-Attention Control for Consistent Image Synthesis and Editing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "from einops import rearrange, repeat\n",
+    "from omegaconf import OmegaConf\n",
+    "\n",
+    "from diffusers import DDIMScheduler\n",
+    "\n",
+    "from masactrl.diffuser_utils import MasaCtrlPipeline\n",
+    "from masactrl.masactrl_utils import AttentionBase\n",
+    "from masactrl.masactrl_utils import regiter_attention_editor_diffusers\n",
+    "\n",
+    "from torchvision.utils import save_image\n",
+    "from torchvision.io import read_image\n",
+    "from pytorch_lightning import seed_everything\n",
+    "\n",
+    "torch.cuda.set_device(6)  # set the GPU device"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Model Construction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note that you may add your Hugging Face token to get access to the models\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "# model_path = \"andite/anything-v4.0\"\n",
+    "model_path = \"CompVis/stable-diffusion-v1-4\"\n",
+    "# model_path = \"runwayml/stable-diffusion-v1-5\"\n",
+    "scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", clip_sample=False, set_alpha_to_one=False)\n",
+    "model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler).to(device)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Real editing with MasaCtrl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from masactrl.masactrl import MutualSelfAttentionControl\n",
+    "from torchvision.io import read_image\n",
+    "\n",
+    "\n",
+    "def load_image(image_path, device):\n",
+    "    image = read_image(image_path)\n",
+    "    image = image[:3].unsqueeze_(0).float() / 127.5 - 1.  # [-1, 1]\n",
+    "    image = F.interpolate(image, (512, 512))\n",
+    "    image = image.to(device)\n",
+    "    return image\n",
+    "\n",
+    "\n",
+    "seed = 42\n",
+    "seed_everything(seed)\n",
+    "\n",
+    "out_dir = \"./workdir/masactrl_real_exp/\"\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "sample_count = len(os.listdir(out_dir))\n",
+    "out_dir = os.path.join(out_dir, f\"sample_{sample_count}\")\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "# source image\n",
+    "SOURCE_IMAGE_PATH = \"./gradio_app/images/corgi.jpg\"\n",
+    "source_image = load_image(SOURCE_IMAGE_PATH, device)\n",
+    "\n",
+    "source_prompt = \"\"\n",
+    "target_prompt = \"a photo of a running corgi\"\n",
+    "prompts = [source_prompt, target_prompt]\n",
+    "\n",
+    "# invert the source image\n",
+    "start_code, latents_list = model.invert(source_image,\n",
+    "                                        source_prompt,\n",
+    "                                        guidance_scale=7.5,\n",
+    "                                        num_inference_steps=50,\n",
+    "                                        return_intermediates=True)\n",
+    "start_code = start_code.expand(len(prompts), -1, -1, -1)\n",
+    "\n",
+    "# results of direct synthesis\n",
+    "editor = AttentionBase()\n",
+    "regiter_attention_editor_diffusers(model, editor)\n",
+    "image_fixed = model([target_prompt],\n",
+    "                    latents=start_code[-1:],\n",
+    "                    num_inference_steps=50,\n",
+    "                    guidance_scale=7.5)\n",
+    "\n",
+    "# inference the synthesized image with MasaCtrl\n",
+    "STEP = 4\n",
+    "LAYPER = 10\n",
+    "\n",
+    "# hijack the attention module\n",
+    "editor = MutualSelfAttentionControl(STEP, LAYPER)\n",
+    "regiter_attention_editor_diffusers(model, editor)\n",
+    "\n",
+    "# inference the synthesized image\n",
+    "image_masactrl = model(prompts,\n",
+    "                       latents=start_code,\n",
+    "                       guidance_scale=7.5)\n",
+    "# Note: querying the inversion intermediate features latents_list\n",
+    "# may obtain better reconstruction and editing results\n",
+    "# image_masactrl = model(prompts,\n",
+    "#                        latents=start_code,\n",
+    "#                        guidance_scale=7.5,\n",
+    "#                        ref_intermediate_latents=latents_list)\n",
+    "\n",
+    "# save the synthesized image\n",
+    "out_image = torch.cat([source_image * 0.5 + 0.5,\n",
+    "                       image_masactrl[0:1],\n",
+    "                       image_fixed,\n",
+    "                       image_masactrl[-1:]], dim=0)\n",
+    "save_image(out_image, os.path.join(out_dir, f\"all_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[0], os.path.join(out_dir, f\"source_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[1], os.path.join(out_dir, f\"reconstructed_source_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[2], os.path.join(out_dir, f\"without_step{STEP}_layer{LAYPER}.png\"))\n",
+    "save_image(out_image[3], os.path.join(out_dir, f\"masactrl_step{STEP}_layer{LAYPER}.png\"))\n",
+    "\n",
+    "print(\"Syntheiszed images are saved in\", out_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.5 ('ldm')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "587aa04bacead72c1ffd459abbe4c8140b72ba2b534b24165b36a2ede3d95042"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+diffusers==0.15.0
+transformers
+opencv-python

style.css ADDED Viewed

	@@ -0,0 +1,3 @@

+/* Center every top-level heading; app.py loads this sheet and renders its title as a "# ..." Markdown heading. */
+h1 {
+    text-align: center;
+  }