Spaces: Running on Zero

first commit

- .gitignore +1 -0
- README.md +8 -5
- app.py +873 -0
- examples/prompt_background.txt +8 -0
- examples/prompt_background_advanced.txt +0 -0
- examples/prompt_boy.txt +15 -0
- examples/prompt_girl.txt +16 -0
- examples/prompt_props.txt +43 -0
- model.py +1410 -0
- prompt_util.py +154 -0
- requirements.txt +14 -0
- share_btn.py +59 -0
- util.py +315 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+.ipynb_checkpoints/*
README.md
CHANGED
@@ -1,13 +1,16 @@
 ---
-title:
+title: SemanticPalette X Animagine XL 3.1
-emoji:
+emoji: π₯π§ π¨π₯
 colorFrom: red
-colorTo:
+colorTo: yellow
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.21.0
 app_file: app.py
-pinned:
+pinned: true
 license: mit
+models:
+- cagliostrolab/animagine-xl-3.1
+- ByteDance/SDXL-Lightning
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,873 @@
# Copyright (c) 2024 Jaerin Lee

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import sys

sys.path.append('../../src')

import argparse
import random
import time
import json
import os
import glob
import pathlib
from functools import partial
from pprint import pprint

import numpy as np
from PIL import Image
import torch

import gradio as gr
from huggingface_hub import snapshot_download

from model import StableMultiDiffusionSDXLPipeline
from util import seed_everything
from prompt_util import preprocess_prompts, _quality_dict, _style_dict
from share_btn import community_icon_html, loading_icon_html, share_js


### Utils


def log_state(state):
    pprint(vars(opt))
    if isinstance(state, gr.State):
        state = state.value
    pprint(vars(state))


def is_empty_image(im: Image.Image) -> bool:
    if im is None:
        return True
    im = np.array(im)
    has_alpha = (im.shape[2] == 4)
    if not has_alpha:
        return False
    elif im.sum() == 0:
        return True
    else:
        return False


### Argument passing

parser = argparse.ArgumentParser(description='Semantic Palette demo powered by StreamMultiDiffusion with SDXL support.')
parser.add_argument('-H', '--height', type=int, default=1024)
parser.add_argument('-W', '--width', type=int, default=2560)
parser.add_argument('--model', type=str, default=None, help='Hugging face model repository or local path for a SD1.5 model checkpoint to run.')
parser.add_argument('--bootstrap_steps', type=int, default=1)
parser.add_argument('--seed', type=int, default=-1)
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--port', type=int, default=8000)
opt = parser.parse_args()


### Global variables and data structures

device = f'cuda:{opt.device}' if opt.device >= 0 else 'cpu'


if opt.model is None:
    model_dict = {
        'Animagine XL 3.1': 'cagliostrolab/animagine-xl-3.1',
    }
else:
    if opt.model.endswith('.safetensors'):
        opt.model = os.path.abspath(os.path.join('checkpoints', opt.model))
    model_dict = {os.path.splitext(os.path.basename(opt.model))[0]: opt.model}

models = {
    k: StableMultiDiffusionSDXLPipeline(device, hf_key=v, has_i2t=False)
    for k, v in model_dict.items()
}


prompt_suggestions = [
    '1girl, souryuu asuka langley, neon genesis evangelion, solo, upper body, v, smile, looking at viewer',
    '1boy, solo, portrait, looking at viewer, white t-shirt, brown hair',
    '1girl, arima kana, oshi no ko, solo, upper body, from behind',
]

opt.max_palettes = 5
opt.default_prompt_strength = 1.0
opt.default_mask_strength = 1.0
opt.default_mask_std = 0.0
opt.default_negative_prompt = (
    'nsfw, worst quality, bad quality, normal quality, cropped, framed'
)
opt.verbose = True
opt.colors = [
    '#000000',
    '#2692F3',
    '#F89E12',
    '#16C232',
    '#F92F6C',
    '#AC6AEB',
    # '#92C62C',
    # '#92C6EC',
    # '#FECAC0',
]


### Event handlers

def add_palette(state):
    old_actives = state.active_palettes
    state.active_palettes = min(state.active_palettes + 1, opt.max_palettes)

    if opt.verbose:
        log_state(state)

    if state.active_palettes != old_actives:
        return [state] + [
            gr.update() if state.active_palettes != opt.max_palettes else gr.update(visible=False)
        ] + [
            gr.update() if i != state.active_palettes - 1 else gr.update(value=state.prompt_names[i + 1], visible=True)
            for i in range(opt.max_palettes)
        ]
    else:
        return [state] + [gr.update() for i in range(opt.max_palettes + 1)]


def select_palette(state, button, idx):
    if idx < 0 or idx > opt.max_palettes:
        idx = 0
    old_idx = state.current_palette
    if old_idx == idx:
        return [state] + [gr.update() for _ in range(opt.max_palettes + 7)]

    state.current_palette = idx

    if opt.verbose:
        log_state(state)

    updates = [state] + [
        gr.update() if i not in (idx, old_idx) else
        gr.update(variant='secondary') if i == old_idx else gr.update(variant='primary')
        for i in range(opt.max_palettes + 1)
    ]
    label = 'Background' if idx == 0 else f'Palette {idx}'
    updates.extend([
        gr.update(value=button, interactive=(idx > 0)),
        gr.update(value=state.prompts[idx], label=f'Edit Prompt for {label}'),
        gr.update(value=state.neg_prompts[idx], label=f'Edit Negative Prompt for {label}'),
        (
            gr.update(value=state.mask_strengths[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_mask_strength, interactive=False)
        ),
        (
            gr.update(value=state.prompt_strengths[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_prompt_strength, interactive=False)
        ),
        (
            gr.update(value=state.mask_stds[idx - 1], interactive=True) if idx > 0 else
            gr.update(value=opt.default_mask_std, interactive=False)
        ),
    ])
    return updates


def change_prompt_strength(state, strength):
    if state.current_palette == 0:
        return state

    state.prompt_strengths[state.current_palette - 1] = strength
    if opt.verbose:
        log_state(state)

    return state


def change_std(state, std):
    if state.current_palette == 0:
        return state

    state.mask_stds[state.current_palette - 1] = std
    if opt.verbose:
        log_state(state)

    return state


def change_mask_strength(state, strength):
    if state.current_palette == 0:
        return state

    state.mask_strengths[state.current_palette - 1] = strength
    if opt.verbose:
        log_state(state)

    return state


def reset_seed(state, seed):
    state.seed = seed
    if opt.verbose:
        log_state(state)

    return state

def rename_prompt(state, name):
    state.prompt_names[state.current_palette] = name
    if opt.verbose:
        log_state(state)

    return [state] + [
        gr.update() if i != state.current_palette else gr.update(value=name)
        for i in range(opt.max_palettes + 1)
    ]


def change_prompt(state, prompt):
    state.prompts[state.current_palette] = prompt
    if opt.verbose:
        log_state(state)

    return state


def change_neg_prompt(state, neg_prompt):
    state.neg_prompts[state.current_palette] = neg_prompt
    if opt.verbose:
        log_state(state)

    return state


def select_model(state, model_id):
    state.model_id = model_id
    if opt.verbose:
        log_state(state)

    return state


def select_style(state, style_name):
    state.style_name = style_name
    if opt.verbose:
        log_state(state)

    return state


def select_quality(state, quality_name):
    state.quality_name = quality_name
    if opt.verbose:
        log_state(state)

    return state


def import_state(state, json_text):
    current_palette = state.current_palette
    # active_palettes = state.active_palettes
    state = argparse.Namespace(**json.loads(json_text))
    state.active_palettes = opt.max_palettes
    return [state] + [
        gr.update(value=v, visible=True) for v in state.prompt_names
    ] + [
        state.model_id,
        state.style_name,
        state.quality_name,
        state.prompts[current_palette],
        state.prompt_names[current_palette],
        state.neg_prompts[current_palette],
        state.prompt_strengths[current_palette - 1],
        state.mask_strengths[current_palette - 1],
        state.mask_stds[current_palette - 1],
        state.seed,
    ]


### Main worker

def generate(state, *args, **kwargs):
    return models[state.model_id](*args, **kwargs)


def run(state, drawpad):
    seed_everything(state.seed if state.seed >=0 else np.random.randint(2147483647))
    print('Generate!')

    background = drawpad['background'].convert('RGBA')
    inpainting_mode = np.asarray(background).sum() != 0
    print('Inpainting mode: ', inpainting_mode)

    user_input = np.asarray(drawpad['layers'][0])  # (H, W, 4)
    foreground_mask = torch.tensor(user_input[..., -1])[None, None]  # (1, 1, H, W)
    user_input = torch.tensor(user_input[..., :-1])  # (H, W, 3)

    palette = torch.tensor([
        tuple(int(s[i+1:i+3], 16) for i in (0, 2, 4))
        for s in opt.colors[1:]
    ])  # (N, 3)
    masks = (palette[:, None, None, :] == user_input[None]).all(dim=-1)[:, None, ...]  # (N, 1, H, W)
    has_masks = [i for i, m in enumerate(masks.sum(dim=(1, 2, 3)) == 0) if not m]
    print('Has mask: ', has_masks)
    masks = masks * foreground_mask
    masks = masks[has_masks]

    if inpainting_mode:
        prompts = [state.prompts[v + 1] for v in has_masks]
        negative_prompts = [state.neg_prompts[v + 1] for v in has_masks]
        mask_strengths = [state.mask_strengths[v] for v in has_masks]
        mask_stds = [state.mask_stds[v] for v in has_masks]
        prompt_strengths = [state.prompt_strengths[v] for v in has_masks]
    else:
        masks = torch.cat([torch.ones_like(foreground_mask), masks], dim=0)
        prompts = [state.prompts[0]] + [state.prompts[v + 1] for v in has_masks]
        negative_prompts = [state.neg_prompts[0]] + [state.neg_prompts[v + 1] for v in has_masks]
        mask_strengths = [1] + [state.mask_strengths[v] for v in has_masks]
        mask_stds = [0] + [state.mask_stds[v] for v in has_masks]
        prompt_strengths = [1] + [state.prompt_strengths[v] for v in has_masks]

    prompts, negative_prompts = preprocess_prompts(
        prompts, negative_prompts, style_name=state.style_name, quality_name=state.quality_name)

    return generate(
        state,
        prompts,
        negative_prompts,
        masks=masks,
        mask_strengths=mask_strengths,
        mask_stds=mask_stds,
        prompt_strengths=prompt_strengths,
        background=background.convert('RGB'),
        background_prompt=state.prompts[0],
        background_negative_prompt=state.neg_prompts[0],
        height=opt.height,
        width=opt.width,
        bootstrap_steps=2,
        guidance_scale=0,
    )


### Load examples

root = pathlib.Path(__file__).parent
print(root)
example_root = os.path.join(root, 'examples')
example_images = glob.glob(os.path.join(example_root, '*.png'))
example_images = [Image.open(i) for i in example_images]

with open(os.path.join(example_root, 'prompt_background_advanced.txt')) as f:
    prompts_background = [l.strip() for l in f.readlines() if l.strip() != '']

with open(os.path.join(example_root, 'prompt_girl.txt')) as f:
    prompts_girl = [l.strip() for l in f.readlines() if l.strip() != '']

with open(os.path.join(example_root, 'prompt_boy.txt')) as f:
    prompts_boy = [l.strip() for l in f.readlines() if l.strip() != '']

with open(os.path.join(example_root, 'prompt_props.txt')) as f:
    prompts_props = [l.strip() for l in f.readlines() if l.strip() != '']
    prompts_props = {l.split(',')[0].strip(): ','.join(l.split(',')[1:]).strip() for l in prompts_props}

prompt_background = lambda: random.choice(prompts_background)
prompt_girl = lambda: random.choice(prompts_girl)
prompt_boy = lambda: random.choice(prompts_boy)
prompt_props = lambda: np.random.choice(list(prompts_props.keys()), size=(opt.max_palettes - 2), replace=False).tolist()


### Main application

css = f"""
#run-button {{
    font-size: 30pt;
    background-image: linear-gradient(to right, #4338ca 0%, #26a0da 51%, #4338ca 100%);
    margin: 0;
    padding: 15px 45px;
    text-align: center;
    text-transform: uppercase;
    transition: 0.5s;
    background-size: 200% auto;
    color: white;
    box-shadow: 0 0 20px #eee;
    border-radius: 10px;
    display: block;
    background-position: right center;
}}

#run-button:hover {{
    background-position: left center;
    color: #fff;
    text-decoration: none;
}}

#semantic-palette {{
    border-style: solid;
    border-width: 0.2em;
    border-color: #eee;
}}

#semantic-palette:hover {{
    box-shadow: 0 0 20px #eee;
}}

#output-screen {{
    width: 100%;
    aspect-ratio: {opt.width} / {opt.height};
}}

.layer-wrap {{
    display: none;
}}
"""

for i in range(opt.max_palettes + 1):
    css = css + f"""
.secondary#semantic-palette-{i} {{
    background-image: linear-gradient(to right, #374151 0%, #374151 71%, {opt.colors[i]} 100%);
    color: white;
}}

.primary#semantic-palette-{i} {{
    background-image: linear-gradient(to right, #4338ca 0%, #4338ca 71%, {opt.colors[i]} 100%);
    color: white;
}}
"""


with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:

    iface = argparse.Namespace()

    def _define_state():
        state = argparse.Namespace()

        # Cursor.
        state.current_palette = 0  # 0: Background; 1,2,3,...: Layers
        state.model_id = list(model_dict.keys())[0]
        state.style_name = '(None)'
        state.quality_name = 'Standard v3.1'

        # State variables (one-hot).
        state.active_palettes = 1

        # Front-end initialized to the default values.
        prompt_props_ = prompt_props()
        state.prompt_names = [
            'π Background',
            'π§ Girl',
            'π¦ Boy',
        ] + prompt_props_ + ['π¨ New Palette' for _ in range(opt.max_palettes - 5)]
        state.prompts = [
            prompt_background(),
            prompt_girl(),
            prompt_boy(),
        ] + [prompts_props[k] for k in prompt_props_] + ['' for _ in range(opt.max_palettes - 5)]
        state.neg_prompts = [
            opt.default_negative_prompt
            + (', humans, humans, humans' if i == 0 else '')
            for i in range(opt.max_palettes + 1)
        ]
        state.prompt_strengths = [opt.default_prompt_strength for _ in range(opt.max_palettes)]
        state.mask_strengths = [opt.default_mask_strength for _ in range(opt.max_palettes)]
        state.mask_stds = [opt.default_mask_std for _ in range(opt.max_palettes)]
        state.seed = opt.seed
        return state

    state = gr.State(value=_define_state)


    ### Demo user interface

    gr.HTML(
        """
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
    <div>
        <h1>π§ Semantic Paint X Animagine XL 3.1 π¨</h1>
        <h5 style="margin: 0;">powered by</h5>
        <h3>StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based Semantic Control</h3>
        <h5 style="margin: 0;">and</h5>
        <h3>Animagine XL 3.1 by Cagliostro Research Lab</h3>
        <h5 style="margin: 0;">If you β€οΈ our project, please visit our Github and give us a π!</h5>
        </br>
        <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
            <a href='https://arxiv.org/abs/2403.09055'>
                <img src="https://img.shields.io/badge/arXiv-2403.09055-red">
            </a>

            <a href='https://jaerinlee.com/research/StreamMultiDiffusion'>
                <img src='https://img.shields.io/badge/Project-Page-green' alt='Project Page'>
            </a>

            <a href='https://github.com/ironjr/StreamMultiDiffusion'>
                <img src='https://img.shields.io/github/stars/ironjr/StreamMultiDiffusion?label=Github&color=blue'>
            </a>

            <a href='https://twitter.com/_ironjr_'>
                <img src='https://img.shields.io/twitter/url?label=_ironjr_&url=https%3A%2F%2Ftwitter.com%2F_ironjr_'>
            </a>

            <a href='https://github.com/ironjr/StreamMultiDiffusion/blob/main/LICENSE'>
                <img src='https://img.shields.io/badge/license-MIT-lightgrey'>
            </a>

            <a href='https://huggingface.co/papers/2403.09055'>
                <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Paper-yellow'>
            </a>

            <a href='https://huggingface.co/spaces/ironjr/SemanticPalette'>
                <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Demo-v1.5-yellow'>
            </a>

            <a href='https://huggingface.co/cagliostrolab/animagine-xl-3.1'>
                <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Model-AnimagineXL3.1-yellow'>
            </a>
        </div>
    </div>
</div>
<div>
</br>
</div>
"""
    )

    with gr.Row():

        iface.image_slot = gr.Image(
            interactive=False,
            show_label=False,
            show_download_button=True,
            type='pil',
            label='Generated Result',
            elem_id='output-screen',
            value=lambda: random.choice(example_images),
        )

    with gr.Row():

        with gr.Column(scale=1):

            with gr.Group(elem_id='semantic-palette'):

                gr.HTML(
                    """
<div style="justify-content: center; align-items: center;">
    <br/>
    <h3 style="margin: 0; text-align: center;"><b>π§ Semantic Palette π¨</b></h3>
    <br/>
</div>
"""
                )

                iface.btn_semantics = [gr.Button(
                    value=state.value.prompt_names[0],
                    variant='primary',
                    elem_id='semantic-palette-0',
                )]
                for i in range(opt.max_palettes):
                    iface.btn_semantics.append(gr.Button(
                        value=state.value.prompt_names[i + 1],
                        variant='secondary',
                        visible=(i < state.value.active_palettes),
                        elem_id=f'semantic-palette-{i + 1}'
                    ))

                iface.btn_add_palette = gr.Button(
                    value='Create New Semantic Brush',
                    variant='primary',
                )

            with gr.Accordion(label='Import/Export Semantic Palette', open=False):
                iface.tbox_state_import = gr.Textbox(label='Put Palette JSON Here To Import')
                iface.json_state_export = gr.JSON(label='Exported Palette')
                iface.btn_export_state = gr.Button("Export Palette β‘οΈ JSON", variant='primary')
                iface.btn_import_state = gr.Button("Import JSON β‘οΈ Palette", variant='secondary')

            gr.HTML(
                """
<div>
</br>
</div>
<div style="justify-content: center; align-items: center;">
    <h3 style="margin: 0; text-align: center;"><b>βUsageβ</b></h3>
    </br>
    <div style="justify-content: center; align-items: left; text-align: left;">
        <p>1-1. Type in the background prompt. Background is not required if you paint the whole drawpad.</p>
        <p>1-2. (Optional: <em><b>Inpainting mode</b></em>) Uploading a background image will make the app into inpainting mode. Removing the image returns to the creation mode. In the inpainting mode, increasing the <em>Mask Blur STD</em> > 8 for every colored palette is recommended for smooth boundaries.</p>
        <p>2. Select a semantic brush by clicking onto one in the <b>Semantic Palette</b> above. Edit prompt for the semantic brush.</p>
        <p>2-1. If you are willing to draw more diverse images, try <b>Create New Semantic Brush</b>.</p>
        <p>3. Start drawing in the <b>Semantic Drawpad</b> tab. The brush color is directly linked to the semantic brushes.</p>
        <p>4. Click [<b>GENERATE!</b>] button to create your (large-scale) artwork!</p>
    </div>
</div>
"""
            )

            gr.HTML(
                """
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
    <h5 style="margin: 0;"><b>... or run in your own π€ space!</b></h5>
</div>
"""
            )

            gr.DuplicateButton()

        with gr.Column(scale=4):

            with gr.Row():

                with gr.Column(scale=3):

                    iface.ctrl_semantic = gr.ImageEditor(
                        image_mode='RGBA',
                        sources=['upload', 'clipboard', 'webcam'],
                        transforms=['crop'],
                        crop_size=(opt.width, opt.height),
                        brush=gr.Brush(
                            colors=opt.colors[1:],
                            color_mode="fixed",
                        ),
                        type='pil',
                        label='Semantic Drawpad',
                        elem_id='drawpad',
                    )

                with gr.Column(scale=1):

                    iface.btn_generate = gr.Button(
                        value='Generate!',
                        variant='primary',
                        # scale=1,
                        elem_id='run-button'
                    )
                    with gr.Group(elem_id="share-btn-container"):
                        gr.HTML(community_icon_html)
                        gr.HTML(loading_icon_html)
                        iface.btn_share = gr.Button("Share with Community", elem_id="share-btn")

                    iface.model_select = gr.Radio(
                        list(model_dict.keys()),
                        label='Stable Diffusion Checkpoint',
                        info='Choose your favorite style.',
                        value=state.value.model_id,
                    )

                    with gr.Accordion(label='Prompt Engineering', open=True):
                        iface.quality_select = gr.Dropdown(
                            label='Quality Presets',
                            interactive=True,
                            choices=list(_quality_dict.keys()),
                            value='Standard v3.1',
                        )
                        iface.style_select = gr.Radio(
                            label='Style Preset',
                            container=True,
                            interactive=True,
                            choices=list(_style_dict.keys()),
                            value='(None)',
                        )

            with gr.Group(elem_id='control-panel'):

                with gr.Row():
                    iface.tbox_prompt = gr.Textbox(
                        label='Edit Prompt for Background',
                        info='What do you want to draw?',
                        value=state.value.prompts[0],
                        placeholder=lambda: random.choice(prompt_suggestions),
                        scale=2,
                    )

                    iface.tbox_name = gr.Textbox(
                        label='Edit Brush Name',
                        info='Just for your convenience.',
                        value=state.value.prompt_names[0],
                        placeholder='π Background',
                        scale=1,
                    )

                with gr.Row():
                    iface.tbox_neg_prompt = gr.Textbox(
                        label='Edit Negative Prompt for Background',
                        info='Add unwanted objects for this semantic brush.',
                        value=opt.default_negative_prompt,
                        scale=2,
                    )

                    iface.slider_strength = gr.Slider(
                        label='Prompt Strength',
                        info='Blends fg & bg in the prompt level, >0.8 Preferred.',
                        minimum=0.5,
                        maximum=1.0,
                        value=opt.default_prompt_strength,
                        scale=1,
                    )

                with gr.Row():
                    iface.slider_alpha = gr.Slider(
                        label='Mask Alpha',
                        info='Factor multiplied to the mask before quantization. Extremely sensitive, >0.98 Preferred.',
                        minimum=0.5,
                        maximum=1.0,
                        value=opt.default_mask_strength,
                    )

                    iface.slider_std = gr.Slider(
                        label='Mask Blur STD',
                        info='Blends fg & bg in the latent level, 0 for generation, 8-32 for inpainting.',
                        minimum=0.0001,
                        maximum=100.0,
                        value=opt.default_mask_std,
                    )

                    iface.slider_seed = gr.Slider(
                        label='Seed',
                        info='The global seed.',
                        minimum=-1,
                        maximum=2147483647,
                        step=1,
                        value=opt.seed,
                    )

    ### Attach event handlers

    for idx, btn in enumerate(iface.btn_semantics):
        btn.click(
            fn=partial(select_palette, idx=idx),
            inputs=[state, btn],
            outputs=[state] + iface.btn_semantics + [
                iface.tbox_name,
                iface.tbox_prompt,
                iface.tbox_neg_prompt,
                iface.slider_alpha,
                iface.slider_strength,
                iface.slider_std,
            ],
            api_name=f'select_palette_{idx}',
        )

    iface.btn_add_palette.click(
        fn=add_palette,
        inputs=state,
        outputs=[state, iface.btn_add_palette] + iface.btn_semantics[1:],
        api_name='create_new',
    )

    iface.btn_generate.click(
        fn=run,
        inputs=[state, iface.ctrl_semantic],
        outputs=iface.image_slot,
        api_name='run',
    )

    iface.slider_alpha.input(
        fn=change_mask_strength,
        inputs=[state, iface.slider_alpha],
        outputs=state,
        api_name='change_alpha',
    )
    iface.slider_std.input(
        fn=change_std,
        inputs=[state, iface.slider_std],
        outputs=state,
        api_name='change_std',
    )
    iface.slider_strength.input(
        fn=change_prompt_strength,
        inputs=[state, iface.slider_strength],
        outputs=state,
        api_name='change_strength',
    )
    iface.slider_seed.input(
        fn=reset_seed,
        inputs=[state, iface.slider_seed],
        outputs=state,
        api_name='reset_seed',
    )

    iface.tbox_name.input(
        fn=rename_prompt,
        inputs=[state, iface.tbox_name],
        outputs=[state] + iface.btn_semantics,
        api_name='prompt_rename',
    )
    iface.tbox_prompt.input(
        fn=change_prompt,
        inputs=[state, iface.tbox_prompt],
        outputs=state,
        api_name='prompt_edit',
    )
    iface.tbox_neg_prompt.input(
        fn=change_neg_prompt,
        inputs=[state, iface.tbox_neg_prompt],
        outputs=state,
        api_name='neg_prompt_edit',
    )

    iface.model_select.change(
        fn=select_model,
        inputs=[state, iface.model_select],
        outputs=state,
        api_name='model_select',
    )
    iface.style_select.change(
        fn=select_style,
        inputs=[state, iface.style_select],
        outputs=state,
        api_name='style_select',
    )
    iface.quality_select.change(
        fn=select_quality,
        inputs=[state, iface.quality_select],
        outputs=state,
        api_name='quality_select',
    )

    iface.btn_share.click(None, [], [], _js=share_js)

    iface.btn_export_state.click(lambda x: vars(x), state, iface.json_state_export)
    iface.btn_import_state.click(import_state, [state, iface.tbox_state_import], [
        state,
        *iface.btn_semantics,
        iface.model_select,
        iface.style_select,
        iface.quality_select,
        iface.tbox_prompt,
        iface.tbox_name,
        iface.tbox_neg_prompt,
        iface.slider_strength,
        iface.slider_alpha,
        iface.slider_std,
        iface.slider_seed,
    ])

    gr.HTML(
        """
<div class="footer">
    <p>We thank <a href="https://cagliostrolab.net/">Cagliostro Research Lab</a> for their permission to use <a href="https://huggingface.co/cagliostrolab/animagine-xl-3.1">Animagine XL 3.1</a> model under academic purpose.
    Note that the MIT license only applies to StreamMultiDiffusion and Semantic Palette demo app, but not Animagine XL 3.1 model, which is distributed under <a href="https://freedevproject.org/faipl-1.0-sd/">Fair AI Public License 1.0-SD</a>.
    </p>
</div>
"""
    )

if __name__ == '__main__':
    demo.launch(server_port=opt.port)
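The drawpad-to-mask conversion in run() above is the core of the app's region control: each foreground palette color is matched pixel-exactly against the painted layer to produce one boolean mask per semantic brush. The following is a minimal standalone sketch of that step, not part of the commit; the color list, image size, and the all-zero placeholder layer are illustrative only.

# Sketch (assumptions: illustrative colors/sizes; `layer` stands in for drawpad['layers'][0]).
import numpy as np
import torch

colors = ['#2692F3', '#F89E12', '#16C232']            # foreground palette colors (hex)
layer = np.zeros((1024, 2560, 4), dtype=np.uint8)     # RGBA drawing layer placeholder

alpha = torch.tensor(layer[..., -1])[None, None]      # (1, 1, H, W) painted-region alpha
rgb = torch.tensor(layer[..., :-1])                   # (H, W, 3)

# Hex string -> (R, G, B), exactly as app.py does with int(s[i+1:i+3], 16).
palette = torch.tensor([
    tuple(int(s[i + 1:i + 3], 16) for i in (0, 2, 4)) for s in colors
])                                                    # (N, 3)

# A pixel belongs to palette entry i iff its RGB matches that color exactly.
masks = (palette[:, None, None, :] == rgb[None]).all(dim=-1)[:, None, ...]  # (N, 1, H, W)
kept = [i for i, empty in enumerate(masks.sum(dim=(1, 2, 3)) == 0) if not empty]
masks = (masks * alpha)[kept]                         # keep only colors that were actually drawn

Because matching is exact, the Gradio brush is restricted to the same fixed color list (color_mode="fixed" in the ImageEditor above); any anti-aliased or off-palette pixel would simply fall outside every mask.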
examples/prompt_background.txt
ADDED
@@ -0,0 +1,8 @@
Maximalism, best quality, high quality, no humans, background, clear sky, black sky, starry universe, planets
Maximalism, best quality, high quality, no humans, background, clear sky, blue sky
Maximalism, best quality, high quality, no humans, background, universe, void, black, galaxy, galaxy, stars, stars, stars
Maximalism, best quality, high quality, no humans, background, galaxy
Maximalism, best quality, high quality, no humans, background, sky, daylight
Maximalism, best quality, high quality, no humans, background, skyscrapers, rooftop, city of light, helicopters, bright night, sky
Maximalism, best quality, high quality, flowers, flowers, flowers, flower garden, no humans, background
Maximalism, best quality, high quality, flowers, flowers, flowers, flower garden
examples/prompt_background_advanced.txt
ADDED
The diff for this file is too large to render.
examples/prompt_boy.txt
ADDED
@@ -0,0 +1,15 @@
1boy, looking at viewer, brown hair, blue shirt
1boy, looking at viewer, brown hair, red shirt
1boy, looking at viewer, brown hair, purple shirt
1boy, looking at viewer, brown hair, orange shirt
1boy, looking at viewer, brown hair, yellow shirt
1boy, looking at viewer, brown hair, green shirt
1boy, looking back, side shaved hair, cyberpunk cloths, robotic suit, large body
1boy, looking back, short hair, renaissance cloths, noble boy
1boy, looking back, long hair, ponytail, leather jacket, heavy metal boy
1boy, looking at viewer, a king, kingly grace, majestic cloths, crown
1boy, looking at viewer, an astronaut, brown hair, faint smile, engineer
1boy, looking at viewer, a medieval knight, helmet, swordman, plate armour
1boy, looking at viewer, black haired, old eastern cloth
1boy, looking back, messy hair, suit, short beard, noir
1boy, looking at viewer, cute face, light smile, starry eyes, jeans
examples/prompt_girl.txt
ADDED
@@ -0,0 +1,16 @@
1girl, looking at viewer, pretty face, light smile, haughty smile, proud, long wavy hair, charcoal dark eyes, chinese cloths
1girl, looking at viewer, princess, pretty face, light smile, haughty smile, proud, long wavy hair, charcoal dark eyes, majestic gown
1girl, looking at viewer, astronaut girl, long red hair, space suit, black starry eyes, happy face, pretty face
1girl, looking at viewer, fantasy adventurer, backpack
1girl, looking at viewer, astronaut girl, spacesuit, eva, happy face
1girl, looking at viewer, soldier, rusty cloths, backpack, pretty face, sad smile, tears
1girl, looking at viewer, majestic cloths, long hair, glittering eye, pretty face
1girl, looking at viewer, from behind, majestic cloths, long hair, glittering eye
1girl, looking at viewer, evil smile, very short hair, suit, evil genius
1girl, looking at viewer, elven queen, green hair, haughty face, eyes wide open, crazy smile, brown jacket, leaves
1girl, looking at viewer, purple hair, happy face, black leather jacket
1girl, looking at viewer, pink hair, happy face, blue jeans, black leather jacket
1girl, looking at viewer, knight, medium length hair, red hair, plate armour, blue eyes, sad, pretty face, determined face
1girl, looking at viewer, pretty face, light smile, orange hair, casual cloths
1girl, looking at viewer, pretty face, large smile, open mouth, uniform, mcdonald employee, short wavy hair
1girl, looking at viewer, brown hair, ponytail, happy face, bright smile, blue jeans and white shirt
examples/prompt_props.txt
ADDED
@@ -0,0 +1,43 @@
π― Palace, Gyeongbokgung palace
π³ Garden, Chinese garden
ποΈ Rome, Ancient city of Rome
𧱠Wall, Castle wall
π΄ Mars, Martian desert, Red rocky desert
π» Grassland, Grasslands
π‘ Village, A fantasy village
π Dragon, a flying chinese dragon
π Earth, Earth seen from ISS
π Space Station, the international space station
πͺ» Grassland, Rusty grassland with flowers
πΌοΈ Tapestry, majestic tapestry, glittering effect, glowing in light, mural painting with mountain
ποΈ City Ruin, city, ruins, ruins, ruins, deserted
ποΈ Renaissance City, renaissance city, renaissance city, renaissance city
π· Flowers, Flower garden
πΌ Flowers, Flower garden, spring garden
πΉ Flowers, Flowers flowers, flowers
β°οΈ Dolomites Mountains, Dolomites
β°οΈ Himalayas Mountains, Himalayas
β°οΈ Alps Mountains, Alps
β°οΈ Mountains, Mountains
βοΈβ°οΈ Mountains, Winter mountains
π·β°οΈ Mountains, Spring mountains
πβ°οΈ Mountains, Summer mountains
π΅ Desert, A sandy desert, dunes
πͺ¨π΅ Desert, A rocky desert
π¦ Waterfall, A giant waterfall
π Ocean, Ocean
β±οΈ Seashore, Seashore
π Sea Horizon, Sea horizon
π Lake, Clear blue lake
π» Computer, A giant supercomputer
π³ Tree, A giant tree
π³ Forest, A forest
π³π³ Forest, A dense forest
π² Forest, Winter forest
π΄ Forest, Summer forest, tropical forest
π Hat, A hat
πΆ Dog, Doggy body parts
π» Cat, A cat
π¦ Owl, A small sitting owl
π¦ Eagle, A small sitting eagle
π Rocket, A flying rocket
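For context, app.py splits each of these lines on the first comma: the left part becomes the palette button name and the remainder becomes the brush prompt. A quick sketch of that parsing, mirroring the dict comprehension in app.py (the example line is simplified, without the emoji label):

# Sketch: how app.py turns a prompt_props.txt line into (button name, prompt).
line = 'Palace, Gyeongbokgung palace'
name = line.split(',')[0].strip()                 # 'Palace'
prompt = ','.join(line.split(',')[1:]).strip()    # 'Gyeongbokgung palace'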
model.py
ADDED
@@ -0,0 +1,1410 @@
1 |
+
# Copyright (c) 2024 Jaerin Lee
|
2 |
+
|
3 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
+
# of this software and associated documentation files (the "Software"), to deal
|
5 |
+
# in the Software without restriction, including without limitation the rights
|
6 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
+
# copies of the Software, and to permit persons to whom the Software is
|
8 |
+
# furnished to do so, subject to the following conditions:
|
9 |
+
|
10 |
+
# The above copyright notice and this permission notice shall be included in all
|
11 |
+
# copies or substantial portions of the Software.
|
12 |
+
|
13 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19 |
+
# SOFTWARE.
|
20 |
+
|
21 |
+
from transformers import Blip2Processor, Blip2ForConditionalGeneration
|
22 |
+
from diffusers import (
|
23 |
+
AutoencoderTiny,
|
24 |
+
StableDiffusionXLPipeline,
|
25 |
+
UNet2DConditionModel,
|
26 |
+
EulerDiscreteScheduler,
|
27 |
+
)
|
28 |
+
from diffusers.models.attention_processor import (
|
29 |
+
AttnProcessor2_0,
|
30 |
+
FusedAttnProcessor2_0,
|
31 |
+
LoRAAttnProcessor2_0,
|
32 |
+
LoRAXFormersAttnProcessor,
|
33 |
+
XFormersAttnProcessor,
|
34 |
+
)
|
35 |
+
from diffusers.loaders import (
|
36 |
+
StableDiffusionXLLoraLoaderMixin,
|
37 |
+
TextualInversionLoaderMixin,
|
38 |
+
)
|
39 |
+
from diffusers.utils import (
|
40 |
+
USE_PEFT_BACKEND,
|
41 |
+
logging,
|
42 |
+
)
|
43 |
+
from huggingface_hub import hf_hub_download
|
44 |
+
from safetensors.torch import load_file
|
45 |
+
|
46 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
47 |
+
|
48 |
+
import torch
|
49 |
+
import torch.nn as nn
|
50 |
+
import torch.nn.functional as F
|
51 |
+
import torchvision.transforms as T
|
52 |
+
from einops import rearrange
|
53 |
+
|
54 |
+
from typing import Tuple, List, Literal, Optional, Union
|
55 |
+
from tqdm import tqdm
|
56 |
+
from PIL import Image
|
57 |
+
|
58 |
+
from util import gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
|
59 |
+
|
60 |
+
|
61 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
62 |
+
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
63 |
+
"""
|
64 |
+
Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
|
65 |
+
Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
|
66 |
+
"""
|
67 |
+
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
68 |
+
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
69 |
+
# rescale the results from guidance (fixes overexposure)
|
70 |
+
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
|
71 |
+
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
|
72 |
+
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
|
73 |
+
return noise_cfg
|
74 |
+
|
75 |
+
|
76 |
+
class StableMultiDiffusionSDXLPipeline(nn.Module):
|
77 |
+
def __init__(
|
78 |
+
self,
|
79 |
+
device: torch.device,
|
80 |
+
dtype: torch.dtype = torch.float16,
|
81 |
+
hf_key: Optional[str] = None,
|
82 |
+
lora_key: Optional[str] = None,
|
83 |
+
load_from_local: bool = False, # Turn on if you have already downloaed LoRA & Hugging Face hub is down.
|
84 |
+
default_mask_std: float = 1.0, # 8.0
|
85 |
+
default_mask_strength: float = 1.0,
|
86 |
+
default_prompt_strength: float = 1.0, # 8.0
|
87 |
+
default_bootstrap_steps: int = 1,
|
88 |
+
default_boostrap_mix_steps: float = 1.0,
|
89 |
+
default_bootstrap_leak_sensitivity: float = 0.2,
|
90 |
+
default_preprocess_mask_cover_alpha: float = 0.3,
|
91 |
+
t_index_list: List[int] = [0, 4, 12, 25, 37], # [0, 5, 16, 18, 20, 37], # # [0, 12, 25, 37], # Magic number.
|
92 |
+
mask_type: Literal['discrete', 'semi-continuous', 'continuous'] = 'discrete',
|
93 |
+
has_i2t: bool = True,
|
94 |
+
lora_weight: float = 1.0,
|
95 |
+
) -> None:
|
96 |
+
r"""Stabilized MultiDiffusion for fast sampling.
|
97 |
+
|
98 |
+
Accelrated region-based text-to-image synthesis with Latent Consistency
|
99 |
+
Model while preserving mask fidelity and quality.
|
100 |
+
|
101 |
+
Args:
|
102 |
+
device (torch.device): Specify CUDA device.
|
103 |
+
hf_key (Optional[str]): Custom StableDiffusion checkpoint for
|
104 |
+
stylized generation.
|
105 |
+
lora_key (Optional[str]): Custom Lightning LoRA for acceleration.
|
106 |
+
load_from_local (bool): Turn on if you have already downloaed LoRA
|
107 |
+
& Hugging Face hub is down.
|
108 |
+
default_mask_std (float): Preprocess mask with Gaussian blur with
|
109 |
+
specified standard deviation.
|
110 |
+
default_mask_strength (float): Preprocess mask by multiplying it
|
111 |
+
globally with the specified variable. Caution: extremely
|
112 |
+
sensitive. Recommended range: 0.98-1.
|
113 |
+
default_prompt_strength (float): Preprocess foreground prompts
|
114 |
+
globally by linearly interpolating its embedding with the
|
115 |
+
background prompt embeddint with specified mix ratio. Useful
|
116 |
+
control handle for foreground blending. Recommended range:
|
117 |
+
0.5-1.
|
118 |
+
default_bootstrap_steps (int): Bootstrapping stage steps to
|
119 |
+
encourage region separation. Recommended range: 1-3.
|
120 |
+
default_boostrap_mix_steps (float): Bootstrapping background is a
|
121 |
+
linear interpolation between background latent and the white
|
122 |
+
image latent. This handle controls the mix ratio. Available
|
123 |
+
range: 0-(number of bootstrapping inference steps). For
|
124 |
+
example, 2.3 means that for the first two steps, white image
|
125 |
+
is used as a bootstrapping background and in the third step,
|
126 |
+
mixture of white (0.3) and registered background (0.7) is used
|
127 |
+
as a bootstrapping background.
|
128 |
+
default_bootstrap_leak_sensitivity (float): Postprocessing at each
|
129 |
+
inference step by masking away the remaining bootstrap
|
130 |
+
backgrounds t Recommended range: 0-1.
|
131 |
+
default_preprocess_mask_cover_alpha (float): Optional preprocessing
|
132 |
+
where each mask covered by other masks is reduced in its alpha
|
133 |
+
value by this specified factor.
|
134 |
+
t_index_list (List[int]): The default scheduling for LCM scheduler.
|
135 |
+
mask_type (Literal['discrete', 'semi-continuous', 'continuous']):
|
136 |
+
defines the mask quantization modes. Details in the codes of
|
137 |
+
`self.process_mask`. Basically, this (subtly) controls the
|
138 |
+
smoothness of foreground-background blending. More continuous
|
139 |
+
means more blending, but smaller generated patch depending on
|
140 |
+
the mask standard deviation.
|
141 |
+
has_i2t (bool): Automatic background image to text prompt con-
|
142 |
+
version with BLIP-2 model. May not be necessary for the non-
|
143 |
+
streaming application.
|
144 |
+
lora_weight (float): Adjusts weight of the LCM/Lightning LoRA.
|
145 |
+
Heavily affects the overall quality!
|
146 |
+
"""
|
147 |
+
        super().__init__()

        self.device = device
        self.dtype = dtype

        self.default_mask_std = default_mask_std
        self.default_mask_strength = default_mask_strength
        self.default_prompt_strength = default_prompt_strength
        self.default_t_list = t_index_list
        self.default_bootstrap_steps = default_bootstrap_steps
        self.default_boostrap_mix_steps = default_boostrap_mix_steps
        self.default_bootstrap_leak_sensitivity = default_bootstrap_leak_sensitivity
        self.default_preprocess_mask_cover_alpha = default_preprocess_mask_cover_alpha
        self.mask_type = mask_type

        # Create model.
        print(f'[INFO] Loading Stable Diffusion...')
        variant = None
        model_ckpt = None
        lora_ckpt = None
        lightning_repo = 'ByteDance/SDXL-Lightning'
        if hf_key is not None:
            print(f'[INFO] Using Hugging Face custom model key: {hf_key}')
            model_key = hf_key
            lora_ckpt = 'sdxl_lightning_4step_lora.safetensors'

            self.pipe = StableDiffusionXLPipeline.from_pretrained(model_key, variant=variant, torch_dtype=self.dtype).to(self.device)
            self.pipe.load_lora_weights(hf_hub_download(lightning_repo, lora_ckpt), adapter_name='lightning')
            self.pipe.set_adapters(["lightning"], adapter_weights=[lora_weight])
            self.pipe.fuse_lora()
        else:
            model_key = 'stabilityai/stable-diffusion-xl-base-1.0'
            variant = 'fp16'
            model_ckpt = "sdxl_lightning_4step_unet.safetensors"  # Use the correct ckpt for your step setting!

            unet = UNet2DConditionModel.from_config(model_key, subfolder='unet').to(self.device, self.dtype)
            unet.load_state_dict(load_file(hf_hub_download(lightning_repo, model_ckpt), device=self.device))
            self.pipe = StableDiffusionXLPipeline.from_pretrained(model_key, unet=unet, torch_dtype=self.dtype, variant=variant).to(self.device)

        # Create the optional BLIP-2 image-to-text model.
        if has_i2t:
            self.i2t_processor = Blip2Processor.from_pretrained('Salesforce/blip2-opt-2.7b')
            self.i2t_model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')

        # Use SDXL-Lightning LoRA by default.
        self.pipe.scheduler = EulerDiscreteScheduler.from_config(
            self.pipe.scheduler.config, timestep_spacing="trailing")
        self.scheduler = self.pipe.scheduler
        self.default_num_inference_steps = 4
        self.default_guidance_scale = 0.0

        if t_index_list is None:
            self.prepare_lightning_schedule(
                list(range(self.default_num_inference_steps)),
                self.default_num_inference_steps,
            )
        else:
            self.prepare_lightning_schedule(t_index_list, 50)

        self.vae = self.pipe.vae
        self.tokenizer = self.pipe.tokenizer
        self.tokenizer_2 = self.pipe.tokenizer_2
        self.text_encoder = self.pipe.text_encoder
        self.text_encoder_2 = self.pipe.text_encoder_2
        self.unet = self.pipe.unet
        self.vae_scale_factor = self.pipe.vae_scale_factor

        # Prepare white background for bootstrapping.
        self.get_white_background(1024, 1024)

        print(f'[INFO] Model is loaded!')

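    # A minimal usage sketch (comments only, not executed). The class name
    # follows the example in `__call__`'s docstring below; the constructor
    # keyword arguments shown here are assumptions and may differ from the
    # actual signature, so treat this as a hedged illustration rather than
    # the definitive API:
    #
    #     device = torch.device('cuda:0')
    #     smd = StableMultiDiffusionPipeline(device, hf_key='cagliostrolab/animagine-xl-3.1')
    #     image = smd(prompts=['1girl, looking at viewer'], masks=mask_image,
    #                 height=1024, width=1024)
    #     image.save('output.png')
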
    def prepare_lightning_schedule(
        self,
        t_index_list: Optional[List[int]] = None,
        num_inference_steps: Optional[int] = None,
        s_churn: float = 0.0,
        s_tmin: float = 0.0,
        s_tmax: float = float("inf"),
    ) -> None:
        r"""Set up a different inference schedule for the diffusion model.

        You do not have to run this explicitly if you want to use the default
        setting, but if you want other time schedules, run this function
        between the module initialization and the main call.

        Note:
            - Recommended t_index_lists for LCMs:
                - [0, 12, 25, 37]: Default schedule for 4 steps. Best for
                    panorama. Not recommended if you want to use bootstrapping.
                    Because the bootstrapping stage affects the initial
                    structuring of the generated image, and in this four-step
                    LCM it is done only at the first step, the structure may
                    be distorted.
                - [0, 4, 12, 25, 37]: Recommended if you use 1-step
                    bootstrapping. Default initialization in this
                    implementation.
                - [0, 5, 16, 18, 20, 37]: Recommended if you use 2-step
                    bootstrapping.
            - Due to the characteristics of the SD1.5 LCM LoRA, setting
                `num_inference_steps` larger than 20 may result in overly
                blurry and unrealistic images. Beware!

        Args:
            t_index_list (Optional[List[int]]): The specified scheduling steps
                relative to the maximum timestep `num_inference_steps`, which
                is 50 by default. That means that
                `t_index_list=[0, 12, 25, 37]` gives relative time indices
                based on the full scale of 50. If None, reinitialize the
                module with the default value.
            num_inference_steps (Optional[int]): The maximum timestep of the
                sampler. Defines the relative scale of the `t_index_list`.
                Rarely used in practice. If None, reinitialize the module with
                the default value.
        """
        if t_index_list is None:
            t_index_list = self.default_t_list
        if num_inference_steps is None:
            num_inference_steps = self.default_num_inference_steps

        self.scheduler.set_timesteps(num_inference_steps)
        self.timesteps = self.scheduler.timesteps[torch.tensor(t_index_list)]

        # EulerDiscreteScheduler

        self.sigmas = self.scheduler.sigmas[torch.tensor(t_index_list)]
        self.sigmas_next = torch.cat([self.sigmas, self.sigmas.new_zeros(1)])[1:]
        sigma_mask = torch.logical_and(s_tmin <= self.sigmas, self.sigmas <= s_tmax)
        # self.gammas = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) * sigma_mask
        self.gammas = min(s_churn / (num_inference_steps - 1), 2**0.5 - 1) * sigma_mask
        self.sigma_hats = self.sigmas * (self.gammas + 1)
        self.dt = self.sigmas_next - self.sigma_hats

        noise_lvs = self.sigmas * (self.sigmas**2 + 1)**(-0.5)
        self.noise_lvs = noise_lvs[None, :, None, None, None]
        self.next_noise_lvs = torch.cat([noise_lvs[1:], noise_lvs.new_zeros(1)])[None, :, None, None, None]

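    # A minimal sketch of overriding the default schedule, assuming the module
    # has already been constructed. The index list is interpreted relative to
    # `num_inference_steps`, so [0, 5, 16, 18, 20, 37] on a 50-step scale picks
    # six of the fifty scheduler timesteps (the setting recommended above for
    # 2-step bootstrapping):
    #
    #     smd.prepare_lightning_schedule(t_index_list=[0, 5, 16, 18, 20, 37],
    #                                    num_inference_steps=50)
    #
    # After this call, `smd.timesteps`, `smd.sigmas`, and the derived
    # `noise_lvs` used for mask quantization are all refreshed.
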
    def upcast_vae(self):
        dtype = self.vae.dtype
        self.vae.to(dtype=torch.float32)
        use_torch_2_0_or_xformers = isinstance(
            self.vae.decoder.mid_block.attentions[0].processor,
            (
                AttnProcessor2_0,
                XFormersAttnProcessor,
                LoRAXFormersAttnProcessor,
                LoRAAttnProcessor2_0,
                FusedAttnProcessor2_0,
            ),
        )
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
        if use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(dtype)
            self.vae.decoder.conv_in.to(dtype)
            self.vae.decoder.mid_block.to(dtype)

    def _get_add_time_ids(
        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
    ):
        add_time_ids = list(original_size + crops_coords_top_left + target_size)

        passed_add_embed_dim = (
            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
        )
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        return add_time_ids

    def encode_prompt(
        self,
        prompt: str,
        prompt_2: Optional[str] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[str] = None,
        negative_prompt_2: Optional[str] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if self.text_encoder is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder, lora_scale)

            if self.text_encoder_2 is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # Define tokenizers and text encoders
        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
        text_encoders = (
            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
        )

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # textual inversion: process multi-vector tokens if necessary
            prompt_embeds_list = []
            prompts = [prompt, prompt_2]
            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    prompt = self.maybe_convert_prompt(prompt, tokenizer)

                text_inputs = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                text_input_ids = text_inputs.input_ids
                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                    text_input_ids, untruncated_ids
                ):
                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                    logger.warning(
                        "The following part of your input was truncated because CLIP can only handle sequences up to"
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )

                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
                    # "2" because SDXL always indexes from the penultimate layer.
                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]

                prompt_embeds_list.append(prompt_embeds)

            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

        # get unconditional embeddings for classifier free guidance
        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
        elif do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt_2 = negative_prompt_2 or negative_prompt

            # normalize str to list
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
            negative_prompt_2 = (
                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
            )

            uncond_tokens: List[str]
            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = [negative_prompt, negative_prompt_2]

            negative_prompt_embeds_list = []
            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)

                max_length = prompt_embeds.shape[1]
                uncond_input = tokenizer(
                    negative_prompt,
                    padding="max_length",
                    max_length=max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                negative_prompt_embeds = text_encoder(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                negative_prompt_embeds_list.append(negative_prompt_embeds)

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

        if self.text_encoder_2 is not None:
            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
        else:
            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            if self.text_encoder_2 is not None:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
            else:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
        if do_classifier_free_guidance:
            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
                bs_embed * num_images_per_prompt, -1
            )

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder_2, lora_scale)

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

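    # Note on the returned shapes (a sketch, based on the standard SDXL text
    # encoders): the per-encoder hidden states are concatenated along the last
    # dimension, so with CLIP ViT-L (768) and OpenCLIP bigG (1280) the prompt
    # embeddings have shape (batch, 77, 2048), while the pooled embeddings come
    # from the second encoder only and have shape (batch, 1280).
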
    @torch.no_grad()
    def get_text_prompts(self, image: Image.Image) -> str:
        r"""A convenient method to extract a text prompt from an image.

        This is called if the user does not provide a background prompt but
        only the background image. We use BLIP-2 to automatically generate
        prompts.

        Args:
            image (Image.Image): A PIL image.

        Returns:
            A single string of text prompt.
        """
        if hasattr(self, 'i2t_model'):
            question = 'Question: What are in the image? Answer:'
            inputs = self.i2t_processor(image, question, return_tensors='pt')
            out = self.i2t_model.generate(**inputs, max_new_tokens=77)
            prompt = self.i2t_processor.decode(out[0], skip_special_tokens=True).strip()
            return prompt
        else:
            return ''

    @torch.no_grad()
    def encode_imgs(
        self,
        imgs: torch.Tensor,
        generator: Optional[torch.Generator] = None,
        vae: Optional[nn.Module] = None,
    ) -> torch.Tensor:
        r"""A wrapper function for the VAE encoder of the latent diffusion model.

        Args:
            imgs (torch.Tensor): An image to get StableDiffusion latents.
                Expected shape: (B, 3, H, W). Expected pixel scale: [0, 1].
            generator (Optional[torch.Generator]): Seed for KL-Autoencoder.
            vae (Optional[nn.Module]): Explicitly specify VAE (used for
                the demo application with TinyVAE).

        Returns:
            An image latent embedding with 1/8 size (depending on the
            autoencoder). Shape: (B, 4, H//8, W//8).
        """
        def _retrieve_latents(
            encoder_output: torch.Tensor,
            generator: Optional[torch.Generator] = None,
            sample_mode: str = 'sample',
        ):
            if hasattr(encoder_output, 'latent_dist') and sample_mode == 'sample':
                return encoder_output.latent_dist.sample(generator)
            elif hasattr(encoder_output, 'latent_dist') and sample_mode == 'argmax':
                return encoder_output.latent_dist.mode()
            elif hasattr(encoder_output, 'latents'):
                return encoder_output.latents
            else:
                raise AttributeError('Could not access latents of provided encoder_output')

        vae = self.vae if vae is None else vae
        imgs = 2 * imgs - 1
        latents = vae.config.scaling_factor * _retrieve_latents(vae.encode(imgs), generator=generator)
        return latents

    @torch.no_grad()
    def decode_latents(self, latents: torch.Tensor, vae: Optional[nn.Module] = None) -> torch.Tensor:
        r"""A wrapper function for the VAE decoder of the latent diffusion model.

        Args:
            latents (torch.Tensor): An image latent to get associated images.
                Expected shape: (B, 4, H//8, W//8).
            vae (Optional[nn.Module]): Explicitly specify VAE (used for
                the demo application with TinyVAE).

        Returns:
            The decoded image of shape (B, 3, H, W) with pixel values scaled
            to [0, 1].
        """
        vae = self.vae if vae is None else vae
        latents = 1 / vae.config.scaling_factor * latents
        imgs = vae.decode(latents).sample
        imgs = (imgs / 2 + 0.5).clip_(0, 1)
        return imgs

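    # A minimal round-trip sketch, assuming `smd` is an instance of this
    # pipeline and `pil_image` is a 1024x1024 RGB PIL image. It only
    # illustrates how the two wrappers compose; exact numerics depend on the
    # VAE checkpoint:
    #
    #     img = T.ToTensor()(pil_image)[None].to(smd.device, smd.dtype)  # (1, 3, 1024, 1024), in [0, 1]
    #     lat = smd.encode_imgs(img)                                     # (1, 4, 128, 128)
    #     rec = smd.decode_latents(lat)                                  # (1, 3, 1024, 1024), in [0, 1]
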
    @torch.no_grad()
    def get_white_background(self, height: int, width: int) -> torch.Tensor:
        r"""White background image latent for bootstrapping or in case of
        absent background.

        Additionally stores the maximally-sized white latent for fast retrieval
        in the future. By default, we initially call this with a 1024x1024
        white image, so the function is rarely visited twice.

        Args:
            height (int): The height of the white *image*, not its latent.
            width (int): The width of the white *image*, not its latent.

        Returns:
            A white image latent of size (1, 4, height//8, width//8). A cropped
            version of the stored white latent is returned if the requested
            size is smaller than what we already have created.
        """
        if not hasattr(self, 'white') or self.white.shape[-2] < height or self.white.shape[-1] < width:
            white = torch.ones(1, 3, height, width, dtype=self.dtype, device=self.device)
            self.white = self.encode_imgs(white)
            return self.white
        return self.white[..., :(height // self.vae_scale_factor), :(width // self.vae_scale_factor)]

    @torch.no_grad()
    def process_mask(
        self,
        masks: Union[torch.Tensor, Image.Image, List[Image.Image]],
        strength: Optional[Union[torch.Tensor, float]] = None,
        std: Optional[Union[torch.Tensor, float]] = None,
        height: int = 1024,
        width: int = 1024,
        use_boolean_mask: bool = True,
        timesteps: Optional[torch.Tensor] = None,
        preprocess_mask_cover_alpha: Optional[float] = None,
    ) -> Tuple[torch.Tensor]:
        r"""Fast preprocess of masks for region-based generation with fine-
        grained controls.

        Mask preprocessing is done in four steps:
            1. Resizing: Resize the masks into the specified width and height
                by nearest neighbor interpolation.
            2. (Optional) Ordering: Masks with higher indices are considered to
                cover the masks with smaller indices. Covered masks are decayed
                in their alpha values by the specified factor of
                `preprocess_mask_cover_alpha`.
            3. Blurring: Gaussian blur is applied to the mask with the
                specified standard deviation (isotropic). This results in a
                gradual increase of the masked region as the timesteps evolve,
                naturally blending foreground and the predesignated background.
                Not strictly required if you want to produce images from
                scratch without background.
            4. Quantization: Split the real-numbered masks of value between
                [0, 1] into predefined noise levels for each quantized
                scheduling step of the diffusion sampler. For example, if the
                diffusion model sampler has noise levels of [0.9977, 0.9912,
                0.9735, 0.8499, 0.5840], which is the default noise level of
                this module with schedule [0, 4, 12, 25, 37], the masks are
                split into binary masks whose values are greater than these
                levels. This results in a gradual increase of the mask region
                as the timesteps increase. Details are described in our paper
                at https://arxiv.org/pdf/2403.09055.pdf.

        On the Three Modes of `mask_type`:
            `self.mask_type` is predefined at the initialization stage of this
            pipeline. Three possible modes are available: 'discrete',
            'semi-continuous', and 'continuous'. These define the mask
            quantization modes we use. Basically, this (subtly) controls the
            smoothness of foreground-background blending. Continuous mode
            produces nonbinary masks to further blend foreground and background
            latents by linearly interpolating between them. Semi-continuous
            mode only applies the continuous mask at the last step of the LCM
            sampler. Due to the large step size of the LCM scheduler, we find
            that our continuous blending helps generating seamless inpainting
            and editing results.

        Args:
            masks (Union[torch.Tensor, Image.Image, List[Image.Image]]): Masks.
            strength (Optional[Union[torch.Tensor, float]]): Mask strength that
                overrides the default value. A globally multiplied factor to
                the mask at the initial stage of processing. Can be applied
                separately for each mask.
            std (Optional[Union[torch.Tensor, float]]): Mask blurring Gaussian
                kernel's standard deviation. Overrides the default value. Can
                be applied separately for each mask.
            height (int): The height of the expected generation. Mask is
                resized to (height//8, width//8) with nearest neighbor inter-
                polation.
            width (int): The width of the expected generation. Mask is resized
                to (height//8, width//8) with nearest neighbor interpolation.
            use_boolean_mask (bool): Specify this to treat the mask image as
                a boolean tensor. The region darker than 0.5 of the maximal
                pixel value (that is, 127.5) is considered as the designated
                mask.
            timesteps (Optional[torch.Tensor]): Defines the scheduler noise
                levels that act as bins of mask quantization.
            preprocess_mask_cover_alpha (Optional[float]): Optional pre-
                processing where each mask covered by other masks is reduced in
                its alpha value by this specified factor. Overrides the default
                value.

        Returns: A tuple of tensors.
            - masks: Preprocessed (ordered, blurred, and quantized) binary/non-
                binary masks (see the explanation on `mask_type` above) for
                region-based image synthesis.
            - masks_blurred: Gaussian blurred masks. Used for optionally
                specified foreground-background blending after image
                generation.
            - std: Mask blur standard deviation. Used for optionally specified
                foreground-background blending after image generation.
        """
        if isinstance(masks, Image.Image):
            masks = [masks]
        if isinstance(masks, (tuple, list)):
            # Assumes white background for Image.Image;
            # inverted boolean masks with shape (1, 1, H, W) for torch.Tensor.
            if use_boolean_mask:
                proc = lambda m: T.ToTensor()(m)[None, -1:] < 0.5
            else:
                proc = lambda m: 1.0 - T.ToTensor()(m)[None, -1:]
            masks = torch.cat([proc(mask) for mask in masks], dim=0).float().clip_(0, 1)
        masks = F.interpolate(masks.float(), size=(height, width), mode='bilinear', align_corners=False)
        masks = masks.to(self.device)

        # Background mask alpha is decayed by the specified factor where foreground masks cover it.
        if preprocess_mask_cover_alpha is None:
            preprocess_mask_cover_alpha = self.default_preprocess_mask_cover_alpha
        if preprocess_mask_cover_alpha > 0:
            masks = torch.stack([
                torch.where(
                    masks[i + 1:].sum(dim=0) > 0,
                    mask * preprocess_mask_cover_alpha,
                    mask,
                ) if i < len(masks) - 1 else mask
                for i, mask in enumerate(masks)
            ], dim=0)

        # Scheduler noise levels for mask quantization.
        if timesteps is None:
            noise_lvs = self.noise_lvs
            next_noise_lvs = self.next_noise_lvs
        else:
            noise_lvs_ = self.sigmas * (self.sigmas**2 + 1)**(-0.5)
            # noise_lvs_ = (1 - self.scheduler.alphas_cumprod[timesteps].to(self.device)) ** 0.5
            noise_lvs = noise_lvs_[None, :, None, None, None].to(masks.device)
            next_noise_lvs = torch.cat([noise_lvs_[1:], noise_lvs_.new_zeros(1)])[None, :, None, None, None]

        # Mask preprocessing parameters are fetched from the default settings.
        if std is None:
            std = self.default_mask_std
        if isinstance(std, (int, float)):
            std = [std] * len(masks)
        if isinstance(std, (list, tuple)):
            std = torch.as_tensor(std, dtype=torch.float, device=self.device)

        if strength is None:
            strength = self.default_mask_strength
        if isinstance(strength, (int, float)):
            strength = [strength] * len(masks)
        if isinstance(strength, (list, tuple)):
            strength = torch.as_tensor(strength, dtype=torch.float, device=self.device)

        if (std > 0).any():
            std = torch.where(std > 0, std, 1e-5)
            masks = gaussian_lowpass(masks, std)
        masks_blurred = masks

        # NOTE: This `strength` aligns with `denoising strength`. However, with LCM, using strength < 0.96
        #       gives unpleasant results.
        masks = masks * strength[:, None, None, None]
        masks = masks.unsqueeze(1).repeat(1, noise_lvs.shape[1], 1, 1, 1)

        # Mask is quantized according to the current noise levels specified by the scheduler.
        if self.mask_type == 'discrete':
            # Discrete mode.
            masks = masks > noise_lvs
        elif self.mask_type == 'semi-continuous':
            # Semi-continuous mode (continuous at the last step only).
            masks = torch.cat((
                masks[:, :-1] > noise_lvs[:, :-1],
                (
                    (masks[:, -1:] - next_noise_lvs[:, -1:]) / (noise_lvs[:, -1:] - next_noise_lvs[:, -1:])
                ).clip_(0, 1),
            ), dim=1)
        elif self.mask_type == 'continuous':
            # Continuous mode: Have the exact same `1` coverage with discrete mode, but the mask gradually
            #                  decreases continuously after the discrete mode boundary to become `0` at the
            #                  next lower threshold.
            masks = ((masks - next_noise_lvs) / (noise_lvs - next_noise_lvs)).clip_(0, 1)

        # NOTE: Post processing mask strength does not align with conventional 'denoising_strength'. However,
        #       fine-grained mask alpha channel tuning is available with this form.
        # masks = masks * strength[None, :, None, None, None]

        h = height // self.vae_scale_factor
        w = width // self.vae_scale_factor
        masks = rearrange(masks.float(), 'p t () h w -> (p t) () h w')
        masks = F.interpolate(masks, size=(h, w), mode='nearest')
        masks = rearrange(masks.to(self.dtype), '(p t) () h w -> p t () h w', p=len(std))
        return masks, masks_blurred, std

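    # A small, self-contained sketch of the quantization step described above,
    # assuming a single blurred mask value `m` in [0, 1] and the default noise
    # levels. It is only meant to illustrate how 'discrete' and 'continuous'
    # modes differ; exact values are approximate:
    #
    #     import torch
    #     m = torch.tensor(0.97)                                 # one blurred mask pixel
    #     lvs = torch.tensor([0.9977, 0.9912, 0.9735, 0.8499, 0.5840])
    #     nxt = torch.cat([lvs[1:], lvs.new_zeros(1)])
    #     discrete = (m > lvs).float()                           # ~[0.00, 0.00, 0.00, 1.00, 1.00]
    #     continuous = ((m - nxt) / (lvs - nxt)).clip(0, 1)      # ~[0.00, 0.00, 0.97, 1.00, 1.00]
    #
    # The continuous mode keeps the same fully-covered steps but decays
    # smoothly past the discrete boundary, which is what blends the foreground
    # and background latents near mask edges.
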
    def scheduler_scale_model_input(
        self,
        latent: torch.FloatTensor,
        idx: int,
    ) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.

        Args:
            latent (`torch.FloatTensor`):
                The input sample.
            idx (`int`):
                The index of the current timestep in the scheduling table.

        Returns:
            `torch.FloatTensor`:
                A scaled input sample.
        """
        latent = latent / ((self.sigmas[idx]**2 + 1) ** 0.5)
        return latent

    def scheduler_step(
        self,
        noise_pred: torch.Tensor,
        idx: int,
        latent: torch.Tensor,
    ) -> torch.Tensor:
        r"""Denoise-only step for the reverse diffusion scheduler.

        Designed to match the interface of the original `pipe.scheduler.step`,
        which is a combination of this method and the following
        `scheduler_add_noise`.

        Args:
            noise_pred (torch.Tensor): Noise prediction results from the U-Net.
            idx (int): Instead of timesteps (in [0, 1000]-scale) use indices
                for the timesteps tensor (ranged in [0, len(timesteps)-1]).
            latent (torch.Tensor): Noisy latent.

        Returns:
            A denoised tensor with the same size as latent.
        """
        # Upcast to avoid precision issues when computing prev_sample.
        latent = latent.to(torch.float32)

        # 1. Compute predicted original sample (x_0) from sigma-scaled predicted noise.
        assert self.scheduler.config.prediction_type == 'epsilon', 'Only supports `prediction_type` of `epsilon` for now.'
        # pred_original_sample = latent - self.sigma_hats[idx] * noise_pred
        # prev_sample = pred_original_sample + noise_pred * (self.dt[i] + self.sigma_hats[i])
        # return pred_original_sample.to(self.dtype)

        # 2. Convert to an ODE derivative.
        prev_sample = latent + noise_pred * self.dt[idx]
        return prev_sample.to(self.dtype)

    def scheduler_add_noise(
        self,
        latent: torch.Tensor,
        noise: Optional[torch.Tensor],
        idx: int,
        s_noise: float = 1.0,
        initial: bool = False,
    ) -> torch.Tensor:
        r"""Separated noise-add step for the reverse diffusion scheduler.

        Designed to match the interface of the original
        `pipe.scheduler.add_noise`.

        Args:
            latent (torch.Tensor): Denoised latent.
            noise (torch.Tensor): Added noise. Can be None. If None, a random
                noise is newly sampled for addition.
            idx (int): Instead of timesteps (in [0, 1000]-scale) use indices
                for the timesteps tensor (ranged in [0, len(timesteps)-1]).

        Returns:
            A noisy tensor with the same size as latent.
        """
        if initial:
            if idx < len(self.sigmas) and idx >= 0:
                noise = torch.randn_like(latent) if noise is None else noise
                return latent + self.sigmas[idx] * noise
            else:
                return latent
        else:
            # 3. Post-add noise.
            noise_lv = (self.sigma_hats[idx]**2 - self.sigmas[idx]**2) ** 0.5
            if self.gammas[idx] > 0 and noise_lv > 0 and s_noise > 0 and idx < len(self.sigmas) and idx >= 0:
                noise = torch.randn_like(latent) if noise is None else noise
                eps = noise * s_noise * noise_lv
                latent = latent + eps
                # pred_original_sample = pred_original_sample + eps
            return latent

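    # The two methods above split one EulerDiscreteScheduler step. A sketch of
    # the math, assuming epsilon prediction and s_churn = 0 so that
    # sigma_hat_i == sigma_i:
    #
    #     x_{i+1} = x_i + eps_theta(x_i) * dt_i,    dt_i = sigma_{i+1} - sigma_hat_i
    #
    # `scheduler_step` applies this deterministic update, while
    # `scheduler_add_noise` re-injects noise: the full initial noising
    # x = x_0 + sigma_i * eps when `initial=True`, and the stochastic churn
    # term sqrt(sigma_hat_i**2 - sigma_i**2) * eps otherwise. Splitting the two
    # halves lets the pipeline mix region latents after the denoise step and
    # before noise is added back (see "Noise is added after mixing." in
    # `__call__` below).
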
    @torch.no_grad()
    def __call__(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        negative_prompts: Union[str, List[str]] = '',
        suffix: Optional[str] = None,  # ', background is ',
        background: Optional[Union[torch.Tensor, Image.Image]] = None,
        background_prompt: Optional[str] = None,
        background_negative_prompt: str = '',
        height: int = 1024,
        width: int = 1024,
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        prompt_strengths: Optional[Union[torch.Tensor, float, List[float]]] = None,
        masks: Optional[Union[Image.Image, List[Image.Image]]] = None,
        mask_strengths: Optional[Union[torch.Tensor, float, List[float]]] = None,
        mask_stds: Optional[Union[torch.Tensor, float, List[float]]] = None,
        use_boolean_mask: bool = True,
        do_blend: bool = True,
        tile_size: int = 1024,
        bootstrap_steps: Optional[int] = None,
        boostrap_mix_steps: Optional[float] = None,
        bootstrap_leak_sensitivity: Optional[float] = None,
        preprocess_mask_cover_alpha: Optional[float] = None,
    ) -> Image.Image:
        r"""Arbitrary-size image generation from multiple pairs of (regional)
        text prompts and masks.

        This is the main routine for this pipeline.

        Example:
            >>> device = torch.device('cuda:0')
            >>> smd = StableMultiDiffusionPipeline(device)
            >>> prompts = {... specify prompts}
            >>> masks = {... specify mask tensors}
            >>> height, width = masks.shape[-2:]
            >>> image = smd(
            >>>     prompts, masks=masks.float(), height=height, width=width)
            >>> image.save('my_beautiful_creation.png')

        Args:
            prompts (Union[str, List[str]]): A text prompt.
            negative_prompts (Union[str, List[str]]): A negative text prompt.
            suffix (Optional[str]): One option for blending foreground prompts
                with background prompts by simply appending the background
                prompt to the end of each foreground prompt with this `middle
                word` in between. For example, if you set this as
                `, background is`, then the foreground prompt will be changed
                into `(fg), background is (bg)` before conditional generation.
            background (Optional[Union[torch.Tensor, Image.Image]]): A
                background image, if the user wants to draw in front of the
                specified image. The background prompt will be automatically
                generated with a BLIP-2 model.
            background_prompt (Optional[str]): The background prompt is used
                for preprocessing foreground prompt embeddings to blend
                foreground and background.
            background_negative_prompt (Optional[str]): The negative background
                prompt.
            height (int): Height of a generated image. It is tiled if larger
                than `tile_size`.
            width (int): Width of a generated image. It is tiled if larger
                than `tile_size`.
            num_inference_steps (Optional[int]): Number of inference steps.
                Default inference scheduling is used if none is specified.
            guidance_scale (Optional[float]): Classifier guidance scale.
                Default value is used if none is specified.
            prompt_strengths (Optional[Union[torch.Tensor, float, List[float]]]):
                Overrides the default value. Preprocess foreground prompts
                globally by linearly interpolating their embeddings with the
                background prompt embedding with the specified mix ratio.
                Useful control handle for foreground blending. Recommended
                range: 0.5-1.
            masks (Optional[Union[Image.Image, List[Image.Image]]]): A list of
                mask images. Each mask associates with each of the text prompts
                and each of the negative prompts. If specified as an image, it
                regards the image as a boolean mask. Also accepts torch.Tensor
                masks, which can have nonbinary values for fine-grained
                controls in mixing regional generations.
            mask_strengths (Optional[Union[torch.Tensor, float, List[float]]]):
                Overrides the default value. Can be assigned for each mask
                separately. Preprocess mask by multiplying it globally with the
                specified variable. Caution: extremely sensitive. Recommended
                range: 0.98-1.
            mask_stds (Optional[Union[torch.Tensor, float, List[float]]]):
                Overrides the default value. Can be assigned for each mask
                separately. Preprocess mask with Gaussian blur with specified
                standard deviation. Recommended range: 0-64.
            use_boolean_mask (bool): Turn this off if you want to treat the
                mask image as a nonbinary one. The module will use the last
                channel of the given image in `masks` as the mask value.
            do_blend (bool): Blend the generated foreground and the optionally
                predefined background by a smooth boundary obtained from
                Gaussian blurs of the foreground `masks` with the given
                `mask_stds`.
            tile_size (Optional[int]): Tile size of the panorama generation.
                Works best with the default training size of the Stable
                Diffusion model, i.e., 512 for SD1.5 and 1024 for SDXL.
            bootstrap_steps (int): Overrides the default value. Bootstrapping
                stage steps to encourage region separation. Recommended range:
                1-3.
            boostrap_mix_steps (float): Overrides the default value.
                Bootstrapping background is a linear interpolation between
                background latent and the white image latent. This handle
                controls the mix ratio. Available range: 0-(number of
                bootstrapping inference steps). For example, 2.3 means that for
                the first two steps, white image is used as a bootstrapping
                background and in the third step, mixture of white (0.3) and
                registered background (0.7) is used as a bootstrapping
                background.
            bootstrap_leak_sensitivity (float): Overrides the default value.
                Postprocessing at each inference step by masking away the
                remaining bootstrap backgrounds. Recommended range: 0-1.
            preprocess_mask_cover_alpha (float): Overrides the default value.
                Optional preprocessing where each mask covered by other masks
                is reduced in its alpha value by this specified factor.

        Returns: A PIL.Image image of a panorama (large-size) image.
        """

        ### Simplest cases

        # prompts is None: return background.
        # masks is None but prompts is not None: return prompts
        # masks is not None and prompts is not None: Do StableMultiDiffusion.

        if prompts is None or (isinstance(prompts, (list, tuple, str)) and len(prompts) == 0):
            if background is None and background_prompt is not None:
                return sample(background_prompt, background_negative_prompt, height, width, num_inference_steps, guidance_scale)
            return background
        elif masks is None or (isinstance(masks, (list, tuple)) and len(masks) == 0):
            return sample(prompts, negative_prompts, height, width, num_inference_steps, guidance_scale)


        ### Prepare generation

        if num_inference_steps is not None:
            # self.prepare_lcm_schedule(list(range(num_inference_steps)), num_inference_steps)
            self.prepare_lightning_schedule(list(range(num_inference_steps)), num_inference_steps)

        if guidance_scale is None:
            guidance_scale = self.default_guidance_scale
        do_classifier_free_guidance = guidance_scale > 1.0


        ### Prompts & Masks

        # asserts #m > 0 and #p > 0.
        # #m == #p == #n > 0: We happily generate according to the prompts & masks.
        # #m != #p: #p should be 1 and we will broadcast text embeds of p through m masks.
        # #p != #n: #n should be 1 and we will broadcast negative embeds n through p prompts.

        if isinstance(masks, Image.Image):
            masks = [masks]
        if isinstance(prompts, str):
            prompts = [prompts]
        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]
        num_masks = len(masks)
        num_prompts = len(prompts)
        num_nprompts = len(negative_prompts)
        assert num_prompts in (num_masks, 1), \
            f'The number of prompts {num_prompts} should match the number of masks {num_masks}!'
        assert num_nprompts in (num_prompts, 1), \
            f'The number of negative prompts {num_nprompts} should match the number of prompts {num_prompts}!'

        fg_masks, masks_g, std = self.process_mask(
            masks,
            mask_strengths,
            mask_stds,
            height=height,
            width=width,
            use_boolean_mask=use_boolean_mask,
            timesteps=self.timesteps,
            preprocess_mask_cover_alpha=preprocess_mask_cover_alpha,
        )  # (p, t, 1, H, W)
        bg_masks = (1 - fg_masks.sum(dim=0)).clip_(0, 1)  # (T, 1, h, w)
        has_background = bg_masks.sum() > 0

        h = (height + self.vae_scale_factor - 1) // self.vae_scale_factor
        w = (width + self.vae_scale_factor - 1) // self.vae_scale_factor


        ### Background

        # background == None && background_prompt == None: Initialize with white background.
        # background == None && background_prompt != None: Generate background *along with other prompts*.
        # background != None && background_prompt == None: Retrieve text prompt using BLIP.
        # background != None && background_prompt != None: Use the given arguments.

        # not has_background: no effect of prompt_strength (the mix ratio between fg prompt & bg prompt)
        # has_background && prompt_strength != 1: mix only for this case.

        bg_latents = None
        if has_background:
            if background is None and background_prompt is not None:
                fg_masks = torch.cat((bg_masks[None], fg_masks), dim=0)
                if suffix is not None:
                    prompts = [p + suffix + background_prompt for p in prompts]
                prompts = [background_prompt] + prompts
                negative_prompts = [background_negative_prompt] + negative_prompts
                has_background = False  # Regard that background does not exist.
            else:
                if background is None and background_prompt is None:
                    background = torch.ones(1, 3, height, width, dtype=self.dtype, device=self.device)
                    background_prompt = 'simple white background image'
                elif background is not None and background_prompt is None:
                    background_prompt = self.get_text_prompts(background)
                if suffix is not None:
                    prompts = [p + suffix + background_prompt for p in prompts]
                prompts = [background_prompt] + prompts
                negative_prompts = [background_negative_prompt] + negative_prompts
                if isinstance(background, Image.Image):
                    background = T.ToTensor()(background).to(dtype=self.dtype, device=self.device)[None]
                background = F.interpolate(background, size=(height, width), mode='bicubic', align_corners=False)
                bg_latents = self.encode_imgs(background)

        # Bootstrapping stage preparation.

        if bootstrap_steps is None:
            bootstrap_steps = self.default_bootstrap_steps
        if boostrap_mix_steps is None:
            boostrap_mix_steps = self.default_boostrap_mix_steps
        if bootstrap_leak_sensitivity is None:
            bootstrap_leak_sensitivity = self.default_bootstrap_leak_sensitivity
        if bootstrap_steps > 0:
            height_ = min(height, tile_size)
            width_ = min(width, tile_size)
            white = self.get_white_background(height, width)  # (1, 4, h, w)


        ### Prepare text embeddings (optimized for the minimal encoder batch size)

        # SDXL pipeline settings.
        batch_size = 1
        output_type = 'pil'

        guidance_rescale = 0.7

        prompt_2 = None
        device = self.device
        num_images_per_prompt = 1
        negative_prompt_2 = None

        original_size = (height, width)
        target_size = (height, width)
        crops_coords_top_left = (0, 0)
        negative_crops_coords_top_left = (0, 0)
        negative_original_size = None
        negative_target_size = None
        pooled_prompt_embeds = None
        negative_pooled_prompt_embeds = None
        text_encoder_lora_scale = None

        prompt_embeds = None
        negative_prompt_embeds = None

        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompts,
            prompt_2=prompt_2,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompts,
            negative_prompt_2=negative_prompt_2,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        add_text_embeds = pooled_prompt_embeds
        if self.text_encoder_2 is None:
            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
        else:
            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

        add_time_ids = self._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=text_encoder_projection_dim,
        )
        if negative_original_size is not None and negative_target_size is not None:
            negative_add_time_ids = self._get_add_time_ids(
                negative_original_size,
                negative_crops_coords_top_left,
                negative_target_size,
                dtype=prompt_embeds.dtype,
                text_encoder_projection_dim=text_encoder_projection_dim,
            )
        else:
            negative_add_time_ids = add_time_ids

        if has_background:
            # First channel is background prompt text embeds. Background prompt itself is not used for generation.
            s = prompt_strengths
            if prompt_strengths is None:
                s = self.default_prompt_strength
            if isinstance(s, (int, float)):
                s = [s] * num_prompts
            if isinstance(s, (list, tuple)):
                assert len(s) == num_prompts, \
                    f'The number of prompt strengths {len(s)} should match the number of prompts {num_prompts}!'
                s = torch.as_tensor(s, dtype=self.dtype, device=self.device)
            s = s[:, None, None]

            be = prompt_embeds[:1]
            fe = prompt_embeds[1:]
            prompt_embeds = torch.lerp(be, fe, s)  # (p, 77, 1024)

            if negative_prompt_embeds is not None:
                bu = negative_prompt_embeds[:1]
                fu = negative_prompt_embeds[1:]
                if num_prompts > num_nprompts:
                    # # negative prompts = 1; # prompts > 1.
                    assert fu.shape[0] == 1 and fe.shape[0] == num_prompts
                    fu = fu.repeat(num_prompts, 1, 1)
                negative_prompt_embeds = torch.lerp(bu, fu, s)  # (n, 77, 1024)
        elif negative_prompt_embeds is not None and num_prompts > num_nprompts:
            # # negative prompts = 1; # prompts > 1.
            assert negative_prompt_embeds.shape[0] == 1 and prompt_embeds.shape[0] == num_prompts
            negative_prompt_embeds = negative_prompt_embeds.repeat(num_prompts, 1, 1)
        # assert negative_prompt_embeds.shape[0] == prompt_embeds.shape[0] == num_prompts
        if num_masks > num_prompts:
            assert masks.shape[0] == num_masks and num_prompts == 1
            prompt_embeds = prompt_embeds.repeat(num_masks, 1, 1)
            if negative_prompt_embeds is not None:
                negative_prompt_embeds = negative_prompt_embeds.repeat(num_masks, 1, 1)

        # SDXL pipeline settings.
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
            del negative_prompt_embeds, negative_pooled_prompt_embeds, negative_add_time_ids

        prompt_embeds = prompt_embeds.to(device)
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)


        ### Run

        # Latent initialization.
        if self.timesteps[0] < 999 and has_background:
            latents = self.scheduler_add_noise(bg_latents, None, 0, initial=True)
        else:
            latents = torch.randn((1, self.unet.config.in_channels, h, w), dtype=self.dtype, device=self.device)
            latents = latents * self.scheduler.init_noise_sigma

        # Tiling (if needed).
        if height > tile_size or width > tile_size:
            t = (tile_size + self.vae_scale_factor - 1) // self.vae_scale_factor
            views, tile_masks = get_panorama_views(h, w, t)
            tile_masks = tile_masks.to(self.device)
        else:
            views = [(0, h, 0, w)]
            tile_masks = latents.new_ones((1, 1, h, w))
        value = torch.zeros_like(latents)
        count_all = torch.zeros_like(latents)

        with torch.autocast('cuda'):
            for i, t in enumerate(tqdm(self.timesteps)):
                fg_mask = fg_masks[:, i]
                bg_mask = bg_masks[i:i + 1]

                value.zero_()
                count_all.zero_()
                for j, (h_start, h_end, w_start, w_end) in enumerate(views):
                    fg_mask_ = fg_mask[..., h_start:h_end, w_start:w_end]
                    latents_ = latents[..., h_start:h_end, w_start:w_end].repeat(num_masks, 1, 1, 1)

                    # Additional arguments for the SDXL pipeline.
                    add_time_ids_input = add_time_ids.clone()
                    add_time_ids_input[:, 2] = h_start * self.vae_scale_factor
                    add_time_ids_input[:, 3] = w_start * self.vae_scale_factor
                    add_time_ids_input = add_time_ids_input.repeat_interleave(num_prompts, dim=0)

                    # Bootstrap for tight background.
                    if i < bootstrap_steps:
                        mix_ratio = min(1, max(0, boostrap_mix_steps - i))
                        # Treat the first foreground latent as the background latent if one does not exist.
                        bg_latents_ = bg_latents[..., h_start:h_end, w_start:w_end] if has_background else latents_[:1]
                        white_ = white[..., h_start:h_end, w_start:w_end]
                        white_ = self.scheduler_add_noise(white_, None, i, initial=True)
                        bg_latents_ = mix_ratio * white_ + (1.0 - mix_ratio) * bg_latents_
                        latents_ = (1.0 - fg_mask_) * bg_latents_ + fg_mask_ * latents_

                        # Centering.
                        latents_ = shift_to_mask_bbox_center(latents_, fg_mask_, reverse=True)

                    latent_model_input = torch.cat([latents_] * 2) if do_classifier_free_guidance else latents_
                    latent_model_input = self.scheduler_scale_model_input(latent_model_input, i)

                    # Perform one step of the reverse diffusion.
                    added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids_input}
                    noise_pred = self.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        timestep_cond=None,
                        cross_attention_kwargs=None,
                        added_cond_kwargs=added_cond_kwargs,
                        return_dict=False,
                    )[0]

                    if do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    if do_classifier_free_guidance and guidance_rescale > 0.0:
                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_cond, guidance_rescale=guidance_rescale)

                    latents_ = self.scheduler_step(noise_pred, i, latents_)

                    if i < bootstrap_steps:
                        # Uncentering.
                        latents_ = shift_to_mask_bbox_center(latents_, fg_mask_)

                        # Remove leakage (optional).
                        leak = (latents_ - bg_latents_).pow(2).mean(dim=1, keepdim=True)
                        leak_sigmoid = torch.sigmoid(leak / bootstrap_leak_sensitivity) * 2 - 1
                        fg_mask_ = fg_mask_ * leak_sigmoid

                    # Mix the latents.
                    fg_mask_ = fg_mask_ * tile_masks[:, j:j + 1, h_start:h_end, w_start:w_end]
                    value[..., h_start:h_end, w_start:w_end] += (fg_mask_ * latents_).sum(dim=0, keepdim=True)
                    count_all[..., h_start:h_end, w_start:w_end] += fg_mask_.sum(dim=0, keepdim=True)

                latents = torch.where(count_all > 0, value / count_all, value)
                bg_mask = (1 - count_all).clip_(0, 1)  # (T, 1, h, w)
                if has_background:
                    latents = (1 - bg_mask) * latents + bg_mask * bg_latents

                # Noise is added after mixing.
                if i < len(self.timesteps) - 1:
                    latents = self.scheduler_add_noise(latents, None, i + 1)

        if not output_type == "latent":
            # make sure the VAE is in float32 mode, as it overflows in float16
            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

            if needs_upcasting:
                self.upcast_vae()
                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

            # unscale/denormalize the latents
            # denormalize with the mean and std if available and not None
            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
            if has_latents_mean and has_latents_std:
                latents_mean = (
|
1386 |
+
torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
|
1387 |
+
)
|
1388 |
+
latents_std = (
|
1389 |
+
torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
|
1390 |
+
)
|
1391 |
+
latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
|
1392 |
+
else:
|
1393 |
+
latents = latents / self.vae.config.scaling_factor
|
1394 |
+
|
1395 |
+
image = self.vae.decode(latents, return_dict=False)[0]
|
1396 |
+
|
1397 |
+
# cast back to fp16 if needed
|
1398 |
+
if needs_upcasting:
|
1399 |
+
self.vae.to(dtype=torch.float16)
|
1400 |
+
else:
|
1401 |
+
image = latents
|
1402 |
+
|
1403 |
+
# Return PIL Image.
|
1404 |
+
image = image[0].clip_(-1, 1) * 0.5 + 0.5
|
1405 |
+
if has_background and do_blend:
|
1406 |
+
fg_mask = torch.sum(masks_g, dim=0).clip_(0, 1)
|
1407 |
+
image = blend(image, background[0], fg_mask)
|
1408 |
+
else:
|
1409 |
+
image = T.ToPILImage()(image)
|
1410 |
+
return image
|
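
The loop above fuses the per-prompt, per-tile denoised latents by mask-weighted averaging (the value / count_all buffers). A minimal, self-contained sketch of that fusion step, with illustrative tensor names that are not taken from model.py:

import torch

def fuse_latents(latents_per_prompt: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    # latents_per_prompt: (p, c, h, w), one denoised latent per regional prompt.
    # weights: (p, 1, h, w) soft weights (foreground masks times tile weights).
    value = (weights * latents_per_prompt).sum(dim=0, keepdim=True)  # weighted sum
    count = weights.sum(dim=0, keepdim=True)                         # total weight
    # Pixels covered by no prompt keep the (zero) weighted sum; the caller is
    # expected to fill them from a background latent, as the loop above does.
    return torch.where(count > 0, value / count, value)

fused = fuse_latents(torch.randn(3, 4, 96, 96), torch.rand(3, 1, 96, 96))
print(fused.shape)  # torch.Size([1, 4, 96, 96])
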
prompt_util.py
ADDED
@@ -0,0 +1,154 @@
from typing import Dict, List, Tuple, Union


quality_prompt_list = [
    {
        "name": "(None)",
        "prompt": "{prompt}",
        "negative_prompt": "nsfw, lowres",
    },
    {
        "name": "Standard v3.0",
        "prompt": "{prompt}, masterpiece, best quality",
        "negative_prompt": "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name",
    },
    {
        "name": "Standard v3.1",
        "prompt": "{prompt}, masterpiece, best quality, very aesthetic, absurdres",
        "negative_prompt": "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
    },
    {
        "name": "Light v3.1",
        "prompt": "{prompt}, (masterpiece), best quality, very aesthetic, perfect face",
        "negative_prompt": "nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
    },
    {
        "name": "Heavy v3.1",
        "prompt": "{prompt}, (masterpiece), (best quality), (ultra-detailed), very aesthetic, illustration, disheveled hair, perfect composition, moist skin, intricate details",
        "negative_prompt": "nsfw, longbody, lowres, bad anatomy, bad hands, missing fingers, pubic hair, extra digit, fewer digits, cropped, worst quality, low quality, very displeasing",
    },
]

style_list = [
    {
        "name": "(None)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "{prompt}, cinematic still, emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "nsfw, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "{prompt}, cinematic photo, 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "nsfw, drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "{prompt}, anime artwork, anime style, key visual, vibrant, studio anime, highly detailed",
        "negative_prompt": "nsfw, photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "{prompt}, manga style, vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "nsfw, ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "{prompt}, concept art, digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "nsfw, photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "{prompt}, pixel-art, low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "nsfw, sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "{prompt}, ethereal fantasy concept art, magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "nsfw, photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "{prompt}, neonpunk style, cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
        "negative_prompt": "nsfw, painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "{prompt}, professional 3d model, octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "nsfw, ugly, deformed, noisy, low poly, blurry, painting",
    },
]


_style_dict = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
_quality_dict = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in quality_prompt_list}


def preprocess_prompt(
    positive: str,
    negative: str = "",
    style_dict: Dict[str, dict] = _quality_dict,
    style_name: str = "Standard v3.1",  # "Heavy v3.1"
    add_style: bool = True,
) -> Tuple[str, str]:
    p, n = style_dict.get(style_name, style_dict["(None)"])

    if add_style and positive.strip():
        formatted_positive = p.format(prompt=positive)
    else:
        formatted_positive = positive

    combined_negative = n
    if negative.strip():
        if combined_negative:
            combined_negative += ", " + negative
        else:
            combined_negative = negative

    return formatted_positive, combined_negative


def preprocess_prompts(
    positives: List[str],
    negatives: List[str] = None,
    style_dict = _style_dict,
    style_name: str = "Manga",  # "(None)"
    quality_dict = _quality_dict,
    quality_name: str = "Standard v3.1",  # "Heavy v3.1"
    add_style: bool = True,
    add_quality_tags = True,
) -> Tuple[List[str], List[str]]:
    if negatives is None:
        negatives = ['' for _ in positives]

    positives_ = []
    negatives_ = []
    for pos, neg in zip(positives, negatives):
        pos, neg = preprocess_prompt(pos, neg, quality_dict, quality_name, add_quality_tags)
        pos, neg = preprocess_prompt(pos, neg, style_dict, style_name, add_style)
        positives_.append(pos)
        negatives_.append(neg)
    return positives_, negatives_


def print_prompts(
    positives: Union[str, List[str]],
    negatives: Union[str, List[str]],
    has_background: bool = False,
) -> None:
    if isinstance(positives, str):
        positives = [positives]
    if isinstance(negatives, str):
        negatives = [negatives]

    for i, prompt in enumerate(positives):
        prefix = ((f'Prompt{i}' if i > 0 else 'Background Prompt')
                  if has_background else f'Prompt{i + 1}')
        print(prefix + ': ' + prompt)
    for i, prompt in enumerate(negatives):
        prefix = ((f'Negative Prompt{i}' if i > 0 else 'Background Negative Prompt')
                  if has_background else f'Negative Prompt{i + 1}')
        print(prefix + ': ' + prompt)
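
A usage sketch (not part of the commit) showing how the helpers above compose quality tags and a style preset onto raw region prompts; the example prompts are made up:

from prompt_util import preprocess_prompts, print_prompts

positives = ['1boy standing in a meadow', 'a castle on a distant hill']
pos, neg = preprocess_prompts(
    positives,
    negatives=['', ''],
    style_name='Anime',            # any name from style_list above
    quality_name='Standard v3.1',  # any name from quality_prompt_list above
)
print_prompts(pos, neg, has_background=True)
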
requirements.txt
ADDED
@@ -0,0 +1,14 @@
torch==2.0.1
torchvision
xformers==0.0.22
einops
diffusers
transformers
huggingface_hub[torch]
gradio
Pillow
emoji
numpy
tqdm
jupyterlab
spaces
share_btn.py
ADDED
@@ -0,0 +1,59 @@
community_icon_html = """<svg id="share-btn-share-icon" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32">
    <path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path>
    <path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path>
</svg>"""

loading_icon_html = """<svg id="share-btn-loading-icon" style="display:none;" class="animate-spin"
    style="color: #ffffff;
"
    xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" fill="none" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><circle style="opacity: 0.25;" cx="12" cy="12" r="10" stroke="white" stroke-width="4"></circle><path style="opacity: 0.75;" fill="white" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path></svg>"""

share_js = """async () => {
    async function uploadFile(file){
        const UPLOAD_URL = 'https://huggingface.co/uploads';
        const response = await fetch(UPLOAD_URL, {
            method: 'POST',
            headers: {
                'Content-Type': file.type,
                'X-Requested-With': 'XMLHttpRequest',
            },
            body: file, /// <- File inherits from Blob
        });
        const url = await response.text();
        return url;
    }
    const gradioEl = document.querySelector('body > gradio-app');
    const imgEls = gradioEl.querySelectorAll('#output-screen img');
    const shareBtnEl = gradioEl.querySelector('#share-btn');
    const shareIconEl = gradioEl.querySelector('#share-btn-share-icon');
    const loadingIconEl = gradioEl.querySelector('#share-btn-loading-icon');
    if(!imgEls.length){
        return;
    };
    shareBtnEl.style.pointerEvents = 'none';
    shareIconEl.style.display = 'none';
    loadingIconEl.style.removeProperty('display');
    const files = await Promise.all(
        [...imgEls].map(async (imgEl) => {
            const res = await fetch(imgEl.src);
            const blob = await res.blob();
            const imgId = Date.now() % 200;
            const fileName = `diffuse-the-rest-${imgId}.jpg`;
            return new File([blob], fileName, { type: 'image/jpeg' });
        })
    );
    const urls = await Promise.all(files.map((f) => uploadFile(f)));
    const htmlImgs = urls.map(url => `<img src='${url}' width='2560' height='1024'>`);
    const descriptionMd = `<div style='display: flex; flex-wrap: wrap; column-gap: 0.75rem;'>
${htmlImgs.join(`\n`)}
</div>`;
    const params = new URLSearchParams({
        title: 'My creation',
        description: descriptionMd,
    });
    const paramsStr = params.toString();
    window.open(`https://huggingface.co/spaces/ironjr/SemanticPaletteXL/discussions/new?${paramsStr}`, '_blank');
    shareBtnEl.style.removeProperty('pointer-events');
    shareIconEl.style.removeProperty('display');
    loadingIconEl.style.display = 'none';
}"""
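
For context, a hypothetical wiring sketch (not taken from app.py) of how these snippets are typically attached to a Gradio button; the element IDs mirror the selectors used in share_js, and the js= hook is assumed from the gradio 4.x event API:

import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js

with gr.Blocks() as demo:
    gallery = gr.Gallery(elem_id='output-screen')      # share_js collects '#output-screen img'
    gr.HTML(community_icon_html + loading_icon_html)   # icons toggled by share_js
    share_button = gr.Button('Share to community', elem_id='share-btn')
    # fn=None with js= runs the snippet purely client-side.
    share_button.click(fn=None, inputs=[], outputs=[], js=share_js)
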
util.py
ADDED
@@ -0,0 +1,315 @@
# Copyright (c) 2024 Jaerin Lee

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import concurrent.futures
import time
from typing import Any, Callable, List, Literal, Tuple, Union

from PIL import Image
import numpy as np

import torch
import torch.nn.functional as F
import torch.cuda.amp as amp
import torchvision.transforms as T
import torchvision.transforms.functional as TF

from diffusers import (
    DiffusionPipeline,
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
)


def seed_everything(seed: int) -> None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


def load_model(
    model_key: str,
    sd_version: Literal['1.5', 'xl'],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.nn.Module:
    if model_key.endswith('.safetensors'):
        if sd_version == '1.5':
            pipeline = StableDiffusionPipeline
        elif sd_version == 'xl':
            pipeline = StableDiffusionXLPipeline
        else:
            raise ValueError(f'Stable Diffusion version {sd_version} not supported.')
        return pipeline.from_single_file(model_key, torch_dtype=dtype).to(device)
    try:
        return DiffusionPipeline.from_pretrained(model_key, variant='fp16', torch_dtype=dtype).to(device)
    except:
        return DiffusionPipeline.from_pretrained(model_key, variant=None, torch_dtype=dtype).to(device)


def get_cutoff(cutoff: float = None, scale: float = None) -> float:
    if cutoff is not None:
        return cutoff

    if scale is not None and cutoff is None:
        return 0.5 / scale

    raise ValueError('Either one of `cutoff`, or `scale` should be specified.')


def get_scale(cutoff: float = None, scale: float = None) -> float:
    if scale is not None:
        return scale

    if cutoff is not None and scale is None:
        return 0.5 / cutoff

    raise ValueError('Either one of `cutoff`, or `scale` should be specified.')


def filter_2d_by_kernel_1d(x: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    assert len(k.shape) in (1,), 'Kernel size should be one of (1,).'
    # assert len(k.shape) in (1, 2), 'Kernel size should be one of (1, 2).'

    b, c, h, w = x.shape
    ks = k.shape[-1]
    k = k.view(1, 1, -1).repeat(c, 1, 1)

    x = x.permute(0, 2, 1, 3)
    x = x.reshape(b * h, c, w)
    x = F.pad(x, (ks // 2, (ks - 1) // 2), mode='replicate')
    x = F.conv1d(x, k, groups=c)
    x = x.reshape(b, h, c, w).permute(0, 3, 2, 1).reshape(b * w, c, h)
    x = F.pad(x, (ks // 2, (ks - 1) // 2), mode='replicate')
    x = F.conv1d(x, k, groups=c)
    x = x.reshape(b, w, c, h).permute(0, 2, 3, 1)
    return x


def filter_2d_by_kernel_2d(x: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    assert len(k.shape) in (2, 3), 'Kernel size should be one of (2, 3).'

    x = F.pad(x, (
        k.shape[-2] // 2, (k.shape[-2] - 1) // 2,
        k.shape[-1] // 2, (k.shape[-1] - 1) // 2,
    ), mode='replicate')

    b, c, _, _ = x.shape
    if len(k.shape) == 2 or (len(k.shape) == 3 and k.shape[0] == 1):
        k = k.view(1, 1, *k.shape[-2:]).repeat(c, 1, 1, 1)
        x = F.conv2d(x, k, groups=c)
    elif len(k.shape) == 3:
        assert k.shape[0] == b, \
            'The number of kernels should match the batch size.'

        k = k.unsqueeze(1)
        x = F.conv2d(x.permute(1, 0, 2, 3), k, groups=b).permute(1, 0, 2, 3)
    return x


@amp.autocast(False)
def filter_by_kernel(
    x: torch.Tensor,
    k: torch.Tensor,
    is_batch: bool = False,
) -> torch.Tensor:
    k_dim = len(k.shape)
    if k_dim == 1 or k_dim == 2 and is_batch:
        return filter_2d_by_kernel_1d(x, k)
    elif k_dim == 2 or k_dim == 3 and is_batch:
        return filter_2d_by_kernel_2d(x, k)
    else:
        raise ValueError('Kernel size should be one of (1, 2, 3).')


def gen_gauss_lowpass_filter_2d(
    std: torch.Tensor,
    window_size: int = None,
) -> torch.Tensor:
    # Gaussian kernel size is odd in order to preserve the center.
    if window_size is None:
        window_size = (
            2 * int(np.ceil(3 * std.max().detach().cpu().numpy())) + 1)

    y = torch.arange(
        window_size, dtype=std.dtype, device=std.device
    ).view(-1, 1).repeat(1, window_size)
    grid = torch.stack((y.t(), y), dim=-1)
    grid -= 0.5 * (window_size - 1)  # (W, W)
    var = (std * std).unsqueeze(-1).unsqueeze(-1)
    distsq = (grid * grid).sum(dim=-1).unsqueeze(0).repeat(*std.shape, 1, 1)
    k = torch.exp(-0.5 * distsq / var)
    k /= k.sum(dim=(-2, -1), keepdim=True)
    return k


def gaussian_lowpass(
    x: torch.Tensor,
    std: Union[float, Tuple[float], torch.Tensor] = None,
    cutoff: Union[float, torch.Tensor] = None,
    scale: Union[float, torch.Tensor] = None,
) -> torch.Tensor:
    if std is None:
        cutoff = get_cutoff(cutoff, scale)
        std = 0.5 / (np.pi * cutoff)
    if isinstance(std, (float, int)):
        std = (std, std)
    if isinstance(std, torch.Tensor):
        """Using nn.functional.conv2d with Gaussian kernels built in runtime is
        80% faster than transforms.functional.gaussian_blur for individual
        items.

        (in GPU); However, in CPU, the result is exactly opposite. But you
        won't gonna run this on CPU, right?
        """
        if len(list(s for s in std.shape if s != 1)) >= 2:
            raise NotImplementedError(
                'Anisotropic Gaussian filter is not currently available.')

        # k.shape == (B, W, W).
        k = gen_gauss_lowpass_filter_2d(std=std.view(-1))
        if k.shape[0] == 1:
            return filter_by_kernel(x, k[0], False)
        else:
            return filter_by_kernel(x, k, True)
    else:
        # Gaussian kernel size is odd in order to preserve the center.
        window_size = tuple(2 * int(np.ceil(3 * s)) + 1 for s in std)
        return TF.gaussian_blur(x, window_size, std)


def blend(
    fg: Union[torch.Tensor, Image.Image],
    bg: Union[torch.Tensor, Image.Image],
    mask: Union[torch.Tensor, Image.Image],
    std: float = 0.0,
) -> Image.Image:
    if not isinstance(fg, torch.Tensor):
        fg = T.ToTensor()(fg)
    if not isinstance(bg, torch.Tensor):
        bg = T.ToTensor()(bg)
    if not isinstance(mask, torch.Tensor):
        mask = (T.ToTensor()(mask) < 0.5).float()[:1]
    if std > 0:
        mask = gaussian_lowpass(mask[None], std)[0].clip_(0, 1)
    return T.ToPILImage()(fg * mask + bg * (1 - mask))


def get_panorama_views(
    panorama_height: int,
    panorama_width: int,
    window_size: int = 64,
) -> tuple[List[Tuple[int]], torch.Tensor]:
    stride = window_size // 2
    is_horizontal = panorama_width > panorama_height
    num_blocks_height = (panorama_height - window_size + stride - 1) // stride + 1
    num_blocks_width = (panorama_width - window_size + stride - 1) // stride + 1
    total_num_blocks = num_blocks_height * num_blocks_width

    half_fwd = torch.linspace(0, 1, (window_size + 1) // 2)
    half_rev = half_fwd.flip(0)
    if window_size % 2 == 1:
        half_rev = half_rev[1:]
    c = torch.cat((half_fwd, half_rev))
    one = torch.ones_like(c)
    f = c.clone()
    f[:window_size // 2] = 1
    b = c.clone()
    b[-(window_size // 2):] = 1

    h = [one] if num_blocks_height == 1 else [f] + [c] * (num_blocks_height - 2) + [b]
    w = [one] if num_blocks_width == 1 else [f] + [c] * (num_blocks_width - 2) + [b]

    views = []
    masks = torch.zeros(total_num_blocks, panorama_height, panorama_width)  # (n, h, w)
    for i in range(total_num_blocks):
        hi, wi = i // num_blocks_width, i % num_blocks_width
        h_start = hi * stride
        h_end = min(h_start + window_size, panorama_height)
        w_start = wi * stride
        w_end = min(w_start + window_size, panorama_width)
        views.append((h_start, h_end, w_start, w_end))

        h_width = h_end - h_start
        w_width = w_end - w_start
        masks[i, h_start:h_end, w_start:w_end] = h[hi][:h_width, None] * w[wi][None, :w_width]

    # Sum of the mask weights at each pixel `masks.sum(dim=1)` must be unity.
    return views, masks[None]  # (1, n, h, w)


def shift_to_mask_bbox_center(im: torch.Tensor, mask: torch.Tensor, reverse: bool = False) -> List[int]:
    h, w = mask.shape[-2:]
    device = mask.device
    mask = mask.reshape(-1, h, w)
    # assert mask.shape[0] == im.shape[0]
    h_occupied = mask.sum(dim=-2) > 0
    w_occupied = mask.sum(dim=-1) > 0
    l = torch.argmax(h_occupied * torch.arange(w, 0, -1).to(device), 1, keepdim=True).cpu()
    r = torch.argmax(h_occupied * torch.arange(w).to(device), 1, keepdim=True).cpu()
    t = torch.argmax(w_occupied * torch.arange(h, 0, -1).to(device), 1, keepdim=True).cpu()
    b = torch.argmax(w_occupied * torch.arange(h).to(device), 1, keepdim=True).cpu()
    tb = (t + b + 1) // 2
    lr = (l + r + 1) // 2
    shifts = (tb - (h // 2), lr - (w // 2))
    shifts = torch.cat(shifts, dim=1)  # (p, 2)
    if reverse:
        shifts = shifts * -1
    return torch.stack([i.roll(shifts=s.tolist(), dims=(-2, -1)) for i, s in zip(im, shifts)], dim=0)


class Streamer:
    def __init__(self, fn: Callable, ema_alpha: float = 0.9) -> None:
        self.fn = fn
        self.ema_alpha = ema_alpha

        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        self.future = self.executor.submit(fn)
        self.image = None

        self.prev_exec_time = 0
        self.ema_exec_time = 0

    @property
    def throughput(self) -> float:
        return 1.0 / self.ema_exec_time if self.ema_exec_time else float('inf')

    def timed_fn(self) -> Any:
        start = time.time()
        res = self.fn()
        end = time.time()
        self.prev_exec_time = end - start
        self.ema_exec_time = self.ema_exec_time * self.ema_alpha + self.prev_exec_time * (1 - self.ema_alpha)
        return res

    def __call__(self) -> Any:
        if self.future.done() or self.image is None:
            # get the result (the new image) and start a new task
            image = self.future.result()
            self.future = self.executor.submit(self.timed_fn)
            self.image = image
            return image
        else:
            # if self.fn() is not ready yet, use the previous image
            # NOTE: This assumes that we have access to a previously generated image here.
            # If there's no previous image (i.e., this is the first invocation), you could fall
            # back to some default image or handle it differently based on your requirements.
            return self.image
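
A small sanity check (illustrative, not part of the file) for the panorama tiling helper: the overlapping window weights at every pixel should sum to one, which is what the denoising loop in model.py relies on when averaging tiles:

import torch
from util import get_panorama_views

# A 64x192 latent grid split into 64-wide windows with 50% overlap.
views, masks = get_panorama_views(64, 192, window_size=64)
print(views[:2])    # [(0, 64, 0, 64), (0, 64, 32, 96)]
print(masks.shape)  # torch.Size([1, 5, 64, 192])
print(torch.allclose(masks.sum(dim=1), torch.ones(1, 64, 192)))  # expected: True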