Pusheen committed on
Commit
003f3f9
1 Parent(s): b5b5e7b

Upload 8 files

Files changed (7)
  1. .gitattributes +0 -1
  2. .gitignore +112 -0
  3. README.md +4 -4
  4. __init__.py +0 -0
  5. app.py +590 -477
  6. environment.yaml +29 -0
  7. requirements.txt +15 -11
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,112 @@
1
+ # IntelliJ project files
2
+ .idea
3
+ *.iml
4
+ out
5
+ gen
6
+
7
+ ### Vim template
8
+ [._]*.s[a-w][a-z]
9
+ [._]s[a-w][a-z]
10
+ *.un~
11
+ Session.vim
12
+ .netrwhist
13
+ *~
14
+
15
+ ### IPythonNotebook template
16
+ # Temporary data
17
+ .ipynb_checkpoints/
18
+
19
+ ### Python template
20
+ # Byte-compiled / optimized / DLL files
21
+ __pycache__/
22
+ *.py[cod]
23
+ *$py.class
24
+
25
+ # C extensions
26
+ *.so
27
+
28
+ # Distribution / packaging
29
+ .Python
30
+ env/
31
+ build/
32
+ develop-eggs/
33
+ dist/
34
+ downloads/
35
+ eggs/
36
+ .eggs/
37
+ #lib/
38
+ #lib64/
39
+ parts/
40
+ sdist/
41
+ var/
42
+ *.egg-info/
43
+ .installed.cfg
44
+ *.egg
45
+
46
+ # PyInstaller
47
+ # Usually these files are written by a python script from a template
48
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
49
+ *.manifest
50
+ *.spec
51
+
52
+ # Installer logs
53
+ pip-log.txt
54
+ pip-delete-this-directory.txt
55
+
56
+ # Unit test / coverage reports
57
+ htmlcov/
58
+ .tox/
59
+ .coverage
60
+ .coverage.*
61
+ .cache
62
+ nosetests.xml
63
+ coverage.xml
64
+ *,cover
65
+
66
+ # Translations
67
+ *.mo
68
+ *.pot
69
+
70
+ # Django stuff:
71
+ *.log
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ target/
78
+
79
+ *.ipynb
80
+ *.params
81
+ # *.json
82
+ .vscode/
83
+ *.code-workspace/
84
+
85
+ lib/pycocotools/_mask.c
86
+ lib/nms/cpu_nms.c
87
+
88
+ OUTPUT
89
+ OUTPUT/*
90
+ models/*
91
+ DATASET
92
+ DATASET/*
93
+ external/
94
+ MODELS
95
+ MODELS/*
96
+ gradio_cached_examples/*
97
+
98
+ kill.sh
99
+
100
+ draws/
101
+ #:wq
102
+ #plot/figs
103
+
104
+ *venv/*
105
+
106
+ # images
107
+ # images/*
108
+
109
+ create_samples/
110
+ create_samples/*
111
+
112
+ ckpts/*
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: LoCo
3
- emoji: 🐠
4
- colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 3.23.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: LoCo_Gligen Demo
3
+ emoji: 👁
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.19.1
8
  app_file: app.py
9
  pinned: false
10
  ---
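The block between the two `---` markers above is the Hugging Face Spaces front matter. A quick way to inspect the values this commit sets, assuming PyYAML is available locally (it is not pinned in this repo):

```python
import yaml  # PyYAML; an assumption, not listed in requirements.txt

# Split on the '---' markers and parse the front matter shown in the diff above
front_matter = open("README.md", encoding="utf-8").read().split("---")[1]
config = yaml.safe_load(front_matter)
print(config["title"], config["sdk"], config["sdk_version"])  # LoCo_Gligen Demo gradio 3.19.1
```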
__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,37 +1,164 @@
1
-
2
  import gradio as gr
3
  import torch
4
- from transformers import CLIPTextModel, CLIPTokenizer
5
- from diffusers import AutoencoderKL, DDIMScheduler
6
- from my_model import unet_2d_condition
7
  import json
8
  import numpy as np
9
  from PIL import Image, ImageDraw, ImageFont
10
  from functools import partial
 
11
  import math
12
- from utils import compute_loco_v2
 
13
  from gradio import processing_utils
14
  from typing import Optional
15
- from typing import List
16
 
17
  import warnings
18
- import string
19
 
20
- import sys
 
 
 
21
 
 
22
  sys.tracebacklimit = 0
23
 
24
  class Blocks(gr.Blocks):
25
 
26
  def __init__(
27
- self,
28
- theme: str = "default",
29
- analytics_enabled: Optional[bool] = None,
30
- mode: str = "blocks",
31
- title: str = "Gradio",
32
- css: Optional[str] = None,
33
- **kwargs,
34
  ):
 
35
  self.extra_configs = {
36
  'thumbnail': kwargs.pop('thumbnail', ''),
37
  'url': kwargs.pop('url', 'https://gradio.app/'),
@@ -46,9 +173,82 @@ class Blocks(gr.Blocks):
46
 
47
  for k, v in self.extra_configs.items():
48
  config[k] = v
49
-
50
  return config
51
-
 
52
  def draw_box(boxes=[], texts=[], img=None):
53
  if len(boxes) == 0 and img is None:
54
  return None
@@ -58,111 +258,13 @@ def draw_box(boxes=[], texts=[], img=None):
58
  colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
59
  draw = ImageDraw.Draw(img)
60
  font = ImageFont.truetype("DejaVuSansMono.ttf", size=18)
61
- print(boxes)
62
  for bid, box in enumerate(boxes):
63
  draw.rectangle([box[0], box[1], box[2], box[3]], outline=colors[bid % len(colors)], width=4)
64
  anno_text = texts[bid]
65
- draw.rectangle(
66
- [box[0], box[3] - int(font.size * 1.2), box[0] + int((len(anno_text) + 0.8) * font.size * 0.6), box[3]],
67
- outline=colors[bid % len(colors)], fill=colors[bid % len(colors)], width=4)
68
- draw.text([box[0] + int(font.size * 0.2), box[3] - int(font.size * 1.2)], anno_text, font=font,
69
- fill=(255, 255, 255))
70
  return img
71
 
72
- '''
73
- inference model
74
- '''
75
-
76
- def inference(device, unet, vae, tokenizer, text_encoder, prompt, bboxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
77
- uncond_input = tokenizer(
78
- ["lowres, bad anatomy, bad hands, bad faces, text, error, missing fingers, extra digit, fewer digits, \
79
- cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
80
- )
81
-
82
- uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
83
-
84
- input_ids = tokenizer(
85
- prompt,
86
- padding="max_length",
87
- truncation=True,
88
- max_length=tokenizer.model_max_length,
89
- return_tensors="pt",
90
- ).input_ids[0].unsqueeze(0).to(device)
91
- # text_embeddings = text_encoder(input_ids)[0]
92
- text_embeddings = torch.cat([uncond_embeddings, text_encoder(input_ids)[0]])
93
- # text_embeddings[1, 1, :] = text_embeddings[1, 2, :]
94
- generator = torch.manual_seed(rand_seed) # Seed generator to create the inital latent noise
95
-
96
- latents = torch.randn(
97
- (batch_size, 4, 64, 64),
98
- generator=generator,
99
- ).to(device)
100
-
101
- # noise_scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
102
- noise_scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
103
-
104
- # generator = torch.Generator("cuda").manual_seed(1024)
105
- noise_scheduler.set_timesteps(50)
106
-
107
- latents = latents * noise_scheduler.init_noise_sigma
108
-
109
- loss = torch.tensor(10000)
110
-
111
- for index, t in enumerate(noise_scheduler.timesteps):
112
- iteration = 0
113
-
114
- while loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < max_index_step:
115
- latents = latents.requires_grad_(True)
116
-
117
- # latent_model_input = torch.cat([latents] * 2)
118
- latent_model_input = latents
119
-
120
- latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
121
- noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
122
- unet(latent_model_input, t, encoder_hidden_states=text_encoder(input_ids)[0])
123
-
124
- # update latents with guidence from gaussian blob
125
-
126
- loss = compute_loco_v2(attn_map_integrated_down, attn_map_integrated_mid, attn_map_integrated_up, bboxes=bboxes,
127
- object_positions=object_positions) * loss_scale
128
-
129
- # print(loss.item() / loss_scale)
130
-
131
- grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
132
-
133
- latents = latents - grad_cond
134
- iteration += 1
135
- torch.cuda.empty_cache()
136
- torch.cuda.empty_cache()
137
-
138
-
139
- with torch.no_grad():
140
-
141
- latent_model_input = torch.cat([latents] * 2)
142
-
143
- latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
144
- noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
145
- unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
146
-
147
- noise_pred = noise_pred.sample
148
-
149
- # perform classifier-free guidance
150
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
151
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
152
-
153
- latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
154
- torch.cuda.empty_cache()
155
- # Decode image
156
- with torch.no_grad():
157
- # print("decode image")
158
- latents = 1 / 0.18215 * latents
159
- image = vae.decode(latents).sample
160
- image = (image / 2 + 0.5).clamp(0, 1)
161
- image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
162
- images = (image * 255).round().astype("uint8")
163
- pil_images = [Image.fromarray(image) for image in images]
164
- return pil_images
165
-
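The deleted inference() above follows the loss-guided denoising pattern: at early timesteps it back-propagates an attention-map layout loss (compute_loco_v2) into the latents before the regular scheduler step. A minimal, self-contained sketch of just that update, with a toy placeholder loss standing in for the attention-map loss:

```python
import torch

def layout_loss(latents, boxes):
    # Toy stand-in for the attention-map loss: penalize latent energy that
    # falls outside every normalized (x0, y0, x1, y1) box.
    _, _, H, W = latents.shape
    inside = torch.zeros(H, W, dtype=torch.bool)
    for x0, y0, x1, y1 in boxes:
        inside[int(y0 * H):int(y1 * H), int(x0 * W):int(x1 * W)] = True
    return latents[..., ~inside].pow(2).mean()

def guided_update(latents, boxes, loss_scale=30.0, loss_threshold=0.1, max_iter=5):
    # Mirrors the while-loop above: repeat the gradient correction on the
    # latents until the (unscaled) loss drops below the threshold.
    for _ in range(max_iter):
        latents = latents.detach().requires_grad_(True)
        loss = layout_loss(latents, boxes) * loss_scale
        if loss.item() / loss_scale < loss_threshold:
            break
        grad = torch.autograd.grad(loss, [latents])[0]
        latents = latents - grad
    return latents.detach()

latents = torch.randn(1, 4, 64, 64)
latents = guided_update(latents, boxes=[(0.25, 0.25, 0.75, 0.75)])
print(latents.shape)  # torch.Size([1, 4, 64, 64])
```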
166
  def get_concat(ims):
167
  if len(ims) == 1:
168
  n_col = 1
@@ -177,94 +279,22 @@ def get_concat(ims):
177
  return dst
178
 
179
 
180
- def click_on_display(language_instruction, grounding_texts, sketch_pad,
181
- loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
182
- state):
183
- if 'boxes' not in state:
184
- state['boxes'] = []
185
- boxes = state['boxes']
186
- x = Image.open('./images/dog.png')
187
- gen_images = [gr.Image.update(value=x, visible=True)]
188
 
189
- return gen_images + [state]
190
 
191
- def Pharse2idx(prompt, phrases):
192
- phrases = [x.strip() for x in phrases.split(';')]
193
- print('phrases', phrases)
194
-
195
- punc_string = string.punctuation
196
- # for punc in [',', '.', ';', ':', '?', '!']:
197
- for punc in punc_string:
198
- prompt = prompt.replace(punc, ' ')
199
- print('clear pp:', prompt)
200
- prompt_list = prompt.strip('.').replace(',', '').split(' ')
201
-
202
- print('prompt_list', prompt_list)
203
- object_positions = []
204
- for obj in phrases:
205
- obj_position = []
206
- for word in obj.split(' '):
207
- print('word', word)
208
- obj_first_index = prompt_list.index(word) + 1
209
- obj_position.append(obj_first_index)
210
- object_positions.append(obj_position)
211
- print('object_positions', object_positions)
212
- return object_positions
213
-
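The deleted Pharse2idx() above maps each semicolon-separated phrase to 1-based word positions in the prompt via list.index, which raises if a phrase word never appears in the prompt. A self-contained, illustrative variant (not the project's function) that skips missing words instead:

```python
import string

def phrase_to_indices(prompt: str, phrases: str):
    # Strip punctuation, then map each semicolon-separated phrase to 1-based
    # word positions in the prompt, skipping words that never occur.
    words = prompt.lower().translate(str.maketrans("", "", string.punctuation)).split()
    positions = []
    for phrase in phrases.lower().split(";"):
        positions.append([words.index(w) + 1 for w in phrase.split() if w in words])
    return positions

print(phrase_to_indices("An airplane and a chair on the grassland.", "airplane;chair"))
# -> [[2], [5]]
```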
214
-
215
- def generate(unet, vae, tokenizer, text_encoder, language_instruction, grounding_texts, sketch_pad,
216
- loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
217
- state):
218
- # language_inst: prompt; grounding_texts: phrases
219
- if 'boxes' not in state:
220
- state['boxes'] = []
221
- boxes = state['boxes']
222
 
223
- # print('raw grounding texts:', grounding_texts)
224
- language_instruction= language_instruction.lower()
225
- phrases = grounding_texts.lower()
226
- # print('got phrases!')
227
- # grounding_texts = [x.strip() for x in grounding_texts.split(';')]
228
- # print('new grd texts:',grounding_texts)
229
-
230
- # # assert len(boxes) == len(grounding_texts)
231
- # if len(boxes) != len(grounding_texts):
232
- # if len(boxes) < len(grounding_texts):
233
- # raise ValueError("""The number of boxes should be equal to the number of grounding objects.
234
- # Number of boxes drawn: {}, number of grounding tokens: {}.
235
- # Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(grounding_texts)))
236
- # grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
237
 
238
- boxes = (np.asarray(boxes) / 512).tolist()
239
- boxes = [[box] for box in boxes]
240
- # grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
241
- # language_instruction_list = language_instruction.strip('.').split(' ')
242
- # object_positions = []
243
- # for obj in grounding_texts:
244
- # obj_position = []
245
- # for word in obj.split(' '):
246
- # obj_first_index = language_instruction_list.index(word) + 1
247
- # obj_position.append(obj_first_index)
248
- # object_positions.append(obj_position)
249
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
250
-
251
- print('getting obj positions!')
252
- object_positions = Pharse2idx(language_instruction, phrases)
253
-
254
- gen_images = inference(device, unet, vae, tokenizer, text_encoder, language_instruction, boxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_step, rand_seed, guidance_scale)
255
-
256
- blank_samples = batch_size % 2 if batch_size > 1 else 0
257
- gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
258
- + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
259
- + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
260
-
261
- return gen_images + [state]
262
-
263
- def generate_legacy(unet, vae, tokenizer, text_encoder, language_instruction, grounding_texts, sketch_pad,
264
- loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
265
  state):
266
  if 'boxes' not in state:
267
  state['boxes'] = []
 
268
  boxes = state['boxes']
269
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
270
  # assert len(boxes) == len(grounding_texts)
@@ -276,24 +306,49 @@ Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(groun
276
  grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
277
 
278
  boxes = (np.asarray(boxes) / 512).tolist()
279
- boxes = [[box] for box in boxes]
280
- grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
281
- language_instruction_list = language_instruction.strip('.').split(' ')
282
- object_positions = []
283
- for obj in grounding_texts:
284
- obj_position = []
285
- for word in obj.split(' '):
286
- obj_first_index = language_instruction_list.index(word) + 1
287
- obj_position.append(obj_first_index)
288
- object_positions.append(obj_position)
289
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
290
-
291
- gen_images = inference(device, unet, vae, tokenizer, text_encoder, language_instruction, boxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_step, rand_seed, guidance_scale)
 
292
 
293
  blank_samples = batch_size % 2 if batch_size > 1 else 0
294
- gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
295
- + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
296
- + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
297
 
298
  return gen_images + [state]
299
 
@@ -301,32 +356,28 @@ Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(groun
301
  def binarize(x):
302
  return (x != 0).astype('uint8') * 255
303
 
304
-
305
  def sized_center_crop(img, cropx, cropy):
306
  y, x = img.shape[:2]
307
  startx = x // 2 - (cropx // 2)
308
- starty = y // 2 - (cropy // 2)
309
- return img[starty:starty + cropy, startx:startx + cropx]
310
-
311
 
312
  def sized_center_fill(img, fill, cropx, cropy):
313
  y, x = img.shape[:2]
314
  startx = x // 2 - (cropx // 2)
315
- starty = y // 2 - (cropy // 2)
316
- img[starty:starty + cropy, startx:startx + cropx] = fill
317
  return img
318
 
319
-
320
  def sized_center_mask(img, cropx, cropy):
321
  y, x = img.shape[:2]
322
  startx = x // 2 - (cropx // 2)
323
- starty = y // 2 - (cropy // 2)
324
- center_region = img[starty:starty + cropy, startx:startx + cropx].copy()
325
  img = (img * 0.2).astype('uint8')
326
- img[starty:starty + cropy, startx:startx + cropx] = center_region
327
  return img
328
 
329
-
330
  def center_crop(img, HW=None, tgt_size=(512, 512)):
331
  if HW is None:
332
  H, W = img.shape[:2]
@@ -336,27 +387,56 @@ def center_crop(img, HW=None, tgt_size=(512, 512)):
336
  img = img.resize(tgt_size)
337
  return np.array(img)
338
 
339
-
340
- def draw(input, grounding_texts, new_image_trigger, state):
341
  if type(input) == dict:
342
  image = input['image']
343
  mask = input['mask']
344
  else:
345
  mask = input
 
346
  if mask.ndim == 3:
347
- mask = 255 - mask[..., 0]
348
 
349
  image_scale = 1.0
350
 
351
  mask = binarize(mask)
352
 
353
  if type(mask) != np.ndarray:
354
  mask = np.array(mask)
355
 
356
- if mask.sum() == 0:
357
  state = {}
358
 
359
- image = None
 
 
 
360
 
361
  if 'boxes' not in state:
362
  state['boxes'] = []
@@ -385,277 +465,310 @@ def draw(input, grounding_texts, new_image_trigger, state):
385
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
386
  grounding_texts = [x for x in grounding_texts if len(x) > 0]
387
  if len(grounding_texts) < len(state['boxes']):
388
- grounding_texts += [f'Obj. {bid + 1}' for bid in range(len(grounding_texts), len(state['boxes']))]
 
389
  box_image = draw_box(state['boxes'], grounding_texts, image)
390
 
391
- return [box_image, new_image_trigger, image_scale, state]
 
 
 
 
392
 
 
393
 
394
  def clear(task, sketch_pad_trigger, batch_size, state, switch_task=False):
395
  if task != 'Grounded Inpainting':
396
  sketch_pad_trigger = sketch_pad_trigger + 1
397
  blank_samples = batch_size % 2 if batch_size > 1 else 0
398
- out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)]
399
- # state = {}
400
- return [None, sketch_pad_trigger, None, 1.0] + out_images + [{}]
401
-
402
-
403
- def main():
404
-
405
- css = """
406
-
407
- #component-0 {
408
- max-width: 550px;
409
- margin: auto;
410
- padding-top: 1.5rem;
411
- }
412
- #img2img_image, #img2img_image > .fixed-height, #img2img_image > .fixed-height > div, #img2img_image > .fixed-height > div > img
413
- {
414
- height: var(--height) !important;
415
- max-height: var(--height) !important;
416
- min-height: var(--height) !important;
417
- }
418
- #paper-info a {
419
- color:#008AD7;
420
- text-decoration: none;
421
- }
422
- #paper-info a:hover {
423
- cursor: pointer;
424
- text-decoration: none;
425
- }
426
- .container {
427
- max-width: 550px;
428
- margin: auto;
429
- padding-top: 1.5rem;
430
- }
431
- .tooltip {
432
- color: #555;
433
- position: relative;
434
- display: inline-block;
435
- cursor: pointer;
436
- }
437
-
438
- .tooltip .tooltiptext {
439
- visibility: hidden;
440
- width: 400px;
441
- background-color: #555;
442
- color: #fff;
443
- text-align: center;
444
- padding: 5px;
445
- border-radius: 5px;
446
- position: absolute;
447
- z-index: 1; /* Set z-index to 1 */
448
- left: 10px;
449
- top: 100%;
450
- opacity: 0;
451
- transition: opacity 0.3s;
452
- }
453
-
454
- .tooltip:hover .tooltiptext {
455
- visibility: visible;
456
- opacity: 1;
457
- z-index: 9999; /* Set a high z-index value when hovering */
458
- }
459
-
460
-
461
  """
 
462
 
463
- rescale_js = """
464
- function(x) {
465
- const root = document.querySelector('gradio-app').shadowRoot || document.querySelector('gradio-app');
466
- let image_scale = parseFloat(root.querySelector('#image_scale input').value) || 1.0;
467
- const image_width = root.querySelector('#img2img_image').clientWidth;
468
- const target_height = parseInt(image_width * image_scale);
469
- document.body.style.setProperty('--height', `${target_height}px`);
470
- root.querySelectorAll('button.justify-center.rounded')[0].style.display='none';
471
- root.querySelectorAll('button.justify-center.rounded')[1].style.display='none';
472
- return x;
473
- }
474
- """
475
- with open('./conf/unet/config.json') as f:
476
- unet_config = json.load(f)
477
-
478
- sd_path = "runwayml/stable-diffusion-v1-5"
479
- unet = unet_2d_condition.UNet2DConditionModel(**unet_config).from_pretrained(sd_path,
480
- subfolder="unet")
481
- tokenizer = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer")
482
- text_encoder = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder")
483
- vae = AutoencoderKL.from_pretrained(sd_path, subfolder="vae")
484
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
485
- unet.to(device)
486
- text_encoder.to(device)
487
- vae.to(device)
488
-
489
- with Blocks(
490
- css=css,
491
- analytics_enabled=False,
492
- title="LoCo: Locally Constrained Training-free Layout-to-Image Generation",
493
- ) as demo:
494
- description = """<p style="text-align: center; font-weight: bold;">
495
- <span style="font-size: 28px">LoCo: Locally Constrained Training-free Layout-to-Image Generation</span>
496
- <br>
497
- <span style="font-size: 18px" id="paper-info">
498
- [<a href="https://peiang-zhao.tech/LoCo/" target="_blank">Project Page</a>]
499
- [<a href="https://arxiv.org/pdf/2311.12342" target="_blank">Paper</a>]
500
- [<a href=" " target="_blank">GitHub</a>]
501
- </span>
502
- <p>Tips:
503
- <ul>
504
- <li>You can change the 'random seed' in 'Advanced Options' below to generate various images. </li>
505
- <li>Layouts with many small bounding boxes may lead to unpleasant results. It's a tough setting for training free methods like LoCo. </li>
506
- <li>Generate an image on A10G takes ~25 seconds. Upgrade the space's GPU for faster inference. :P </li>
507
- </ul>
508
- </p>
509
- """
510
- gr.HTML(description)
511
- with gr.Column():
512
- language_instruction = gr.Textbox(
513
- label="Text Prompt (e.g., a dog and a car)",
514
- )
515
- grounding_instruction = gr.Textbox(
516
- label="Grounding instruction (Separated by semicolon, e.g., dog;car)",
517
- )
518
  sketch_pad_trigger = gr.Number(value=0, visible=False)
519
  sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
520
  init_white_trigger = gr.Number(value=0, visible=False)
521
  image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
522
  new_image_trigger = gr.Number(value=0, visible=False)
523
 
524
-
525
- with gr.Row():
526
- sketch_pad = gr.Paint(label="Sketch Pad", elem_id="img2img_image", source='canvas', shape=(512, 512))
 
 
 
 
 
 
 
 
 
527
  with gr.Row():
528
- # sketch_pad = gr.Image(source='canvas', tool='sketch', size=(512, 512))
529
  out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
530
- with gr.Row():
531
- out_gen_1 = gr.Image(type="pil", visible=True, label="Generated Image")
532
-
533
  with gr.Row():
534
  clear_btn = gr.Button(value='Clear')
535
  gen_btn = gr.Button(value='Generate')
536
-
537
  with gr.Accordion("Advanced Options", open=False):
538
  with gr.Column():
539
- description = """<div class="tooltip">Loss Scale Factor &#9432
540
- <span class="tooltiptext">The scale factor of the constraints. The larger it is, the better control we get while it sometimes losses fidelity. </span>
541
- </div>
542
- <div class="tooltip">CFG Guidance Scale &#9432
543
- <span class="tooltiptext">The scale factor of classifier-free guidance. </span>
544
- </div>
545
- <div class="tooltip" >Max Iteration per Step &#9432
546
- <span class="tooltiptext">The max iterations of applying constraints in each diffusion inference process.</span>
547
- </div>
548
- <div class="tooltip" >Loss Threshold &#9432
549
- <span class="tooltiptext">The threshold of loss. If the loss computed by cross-attention map is smaller then the threshold, the guidance is stopped. </span>
550
- </div>
551
- <div class="tooltip" >Max Step of Backward Guidance &#9432
552
- <span class="tooltiptext">The max steps of guidance in diffusion inference process.</span>
553
- </div>
554
- """
555
- gr.HTML(description)
556
- Loss_scale = gr.Slider(minimum=0, maximum=200, step=5, value=50,label="Loss Scale Factor")
557
- guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="CFG Guidance Scale")
558
- batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=1, label="Number of Samples", visible=False)
559
- max_iter = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Max Iteration per Step")
560
- loss_threshold = gr.Slider(minimum=0, maximum=0.2, step=0.001, value=0.002, label="Loss Threshold")
561
- max_step = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Max Step of Guidance")
562
- rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="Random Seed")
563
-
564
- state = gr.State({})
565
-
566
-
567
- class Controller:
568
- def __init__(self):
569
- self.calls = 0
570
- self.tracks = 0
571
- self.resizes = 0
572
- self.scales = 0
573
-
574
- def init_white(self, init_white_trigger):
575
- self.calls += 1
576
- return np.ones((512, 512), dtype='uint8') * 255, 1.0, init_white_trigger + 1
577
-
578
- def change_n_samples(self, n_samples):
579
- blank_samples = n_samples % 2 if n_samples > 1 else 0
580
- return [gr.Image.update(visible=True) for _ in range(n_samples + blank_samples)] \
581
- + [gr.Image.update(visible=False) for _ in range(4 - n_samples - blank_samples)]
582
-
583
-
584
- controller = Controller()
585
- demo.load(
586
- lambda x: x + 1,
587
- inputs=sketch_pad_trigger,
588
- outputs=sketch_pad_trigger,
589
- queue=False)
590
- sketch_pad.edit(
591
- draw,
592
- inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
593
- outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
594
- queue=False,
595
- )
596
- grounding_instruction.change(
597
- draw,
598
- inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
599
- outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
600
- queue=False,
601
- )
602
- clear_btn.click(
603
- clear,
604
- inputs=[sketch_pad_trigger, sketch_pad_trigger, batch_size, state],
605
- outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, state],
606
- queue=False)
607
-
608
- sketch_pad_trigger.change(
609
- controller.init_white,
610
- inputs=[init_white_trigger],
611
- outputs=[sketch_pad, image_scale, init_white_trigger],
612
- queue=False)
613
-
614
- gen_btn.click(
615
- fn=partial(generate, unet, vae, tokenizer, text_encoder,),
616
- inputs=[
617
- language_instruction, grounding_instruction, sketch_pad,
618
- loss_threshold, guidance_scale, batch_size, rand_seed,
619
- max_step,
620
- Loss_scale, max_iter,
621
- state,
 
622
  ],
623
- outputs=[out_gen_1, state],
624
- queue=True
625
- )
626
- sketch_pad_resize_trigger.change(
627
- None,
628
- None,
629
- sketch_pad_resize_trigger,
630
- _js=rescale_js,
631
- queue=False)
632
- init_white_trigger.change(
633
- None,
634
- None,
635
- init_white_trigger,
636
- _js=rescale_js,
637
- queue=False)
638
-
639
- with gr.Column():
640
- gr.Examples(
641
- examples=[
642
- [
643
- # "images/input.png",
644
- "An airplane and a chair on the grassland.",
645
- "airplane;chair",
646
- "images/airplane_chair.png"
647
- ],
648
  ],
649
- inputs=[language_instruction, grounding_instruction, out_gen_1],
650
- outputs=None,
651
- fn=None,
652
- cache_examples=False,
653
- )
654
- description = """<p> Some source codes of the demo are modified based on the <a href="https://huggingface.co/spaces/gligen/demo/tree/main">GlIGen</a> and <a href="https://huggingface.co/spaces/silentchen/layout-guidance">Layout-guidance</a>. Thanks! </p>"""
655
- gr.HTML(description)
656
 
657
- demo.queue(concurrency_count=1, api_open=False)
658
- demo.launch(share=False, show_api=False, show_error=True)
659
 
660
- if __name__ == '__main__':
661
- main()
 
 
1
  import gradio as gr
2
  import torch
3
+ from omegaconf import OmegaConf
4
+ from gligen.task_grounded_generation import grounded_generation_box, load_ckpt, load_common_ckpt
5
+
6
  import json
7
  import numpy as np
8
  from PIL import Image, ImageDraw, ImageFont
9
  from functools import partial
10
+ from collections import Counter
11
  import math
12
+ import gc
13
+
14
  from gradio import processing_utils
15
  from typing import Optional
 
16
 
17
  import warnings
 
18
 
19
+ from datetime import datetime
20
+
21
+ from huggingface_hub import hf_hub_download
22
+ hf_hub_download = partial(hf_hub_download, library_name="gligen_demo")
23
 
24
+ import sys
25
  sys.tracebacklimit = 0
26
 
27
+
28
+ def load_from_hf(repo_id, filename='diffusion_pytorch_model.bin', subfolder=None):
29
+ cache_file = hf_hub_download(repo_id=repo_id, filename=filename, subfolder=subfolder)
30
+ return torch.load(cache_file, map_location='cpu')
31
+
32
+ def load_ckpt_config_from_hf(modality):
33
+ ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='model')
34
+ config = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='config')
35
+ return ckpt, config
36
+
37
+
38
+ def ckpt_load_helper(modality, is_inpaint, is_style, common_instances=None):
39
+ pretrained_ckpt_gligen, config = load_ckpt_config_from_hf(modality)
40
+ config = OmegaConf.create( config["_content"] ) # config used in training
41
+ config.alpha_scale = 1.0
42
+ config.model['params']['is_inpaint'] = is_inpaint
43
+ config.model['params']['is_style'] = is_style
44
+
45
+ if common_instances is None:
46
+ common_ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'common.pth', subfolder='model')
47
+ common_instances = load_common_ckpt(config, common_ckpt)
48
+
49
+ loaded_model_list = load_ckpt(config, pretrained_ckpt_gligen, common_instances)
50
+
51
+ return loaded_model_list, common_instances
52
+
53
+
54
+ class Instance:
55
+ def __init__(self, capacity = 2):
56
+ self.model_type = 'base'
57
+ self.loaded_model_list = {}
58
+ self.counter = Counter()
59
+ self.global_counter = Counter()
60
+ self.loaded_model_list['base'], self.common_instances = ckpt_load_helper(
61
+ 'gligen-generation-text-box',
62
+ is_inpaint=False, is_style=False, common_instances=None
63
+ )
64
+ self.capacity = capacity
65
+
66
+ def _log(self, model_type, batch_size, instruction, phrase_list):
67
+ self.counter[model_type] += 1
68
+ self.global_counter[model_type] += 1
69
+ current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
70
+ print('[{}] Current: {}, All: {}. Samples: {}, prompt: {}, phrases: {}'.format(
71
+ current_time, dict(self.counter), dict(self.global_counter), batch_size, instruction, phrase_list
72
+ ))
73
+
74
+ def get_model(self, model_type, batch_size, instruction, phrase_list):
75
+ if model_type in self.loaded_model_list:
76
+ self._log(model_type, batch_size, instruction, phrase_list)
77
+ return self.loaded_model_list[model_type]
78
+
79
+ if self.capacity == len(self.loaded_model_list):
80
+ least_used_type = self.counter.most_common()[-1][0]
81
+ del self.loaded_model_list[least_used_type]
82
+ del self.counter[least_used_type]
83
+ gc.collect()
84
+ torch.cuda.empty_cache()
85
+
86
+ self.loaded_model_list[model_type] = self._get_model(model_type)
87
+ self._log(model_type, batch_size, instruction, phrase_list)
88
+ return self.loaded_model_list[model_type]
89
+
90
+ def _get_model(self, model_type):
91
+ if model_type == 'base':
92
+ return ckpt_load_helper(
93
+ 'gligen-generation-text-box',
94
+ is_inpaint=False, is_style=False, common_instances=self.common_instances
95
+ )[0]
96
+ elif model_type == 'inpaint':
97
+ return ckpt_load_helper(
98
+ 'gligen-inpainting-text-box',
99
+ is_inpaint=True, is_style=False, common_instances=self.common_instances
100
+ )[0]
101
+ elif model_type == 'style':
102
+ return ckpt_load_helper(
103
+ 'gligen-generation-text-image-box',
104
+ is_inpaint=False, is_style=True, common_instances=self.common_instances
105
+ )[0]
106
+
107
+ assert False
108
+
109
+ instance = Instance()
110
+
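Instance.get_model above keeps at most `capacity` checkpoints resident and evicts the least-requested one before loading another. A self-contained sketch of that counter-based eviction scheme, with a stand-in `load_fn` used in place of ckpt_load_helper (an assumption for illustration):

```python
from collections import Counter

class TinyModelCache:
    """Sketch of the eviction scheme in Instance.get_model above: keep at most
    `capacity` loaded entries and drop the least-requested one when full.
    `load_fn` stands in for ckpt_load_helper and is an assumption here."""

    def __init__(self, load_fn, capacity=2):
        self.load_fn = load_fn
        self.capacity = capacity
        self.cache = {}
        self.counter = Counter()

    def get(self, key):
        if key not in self.cache:
            if len(self.cache) >= self.capacity:
                least_used = self.counter.most_common()[-1][0]
                del self.cache[least_used]
                del self.counter[least_used]
            self.cache[key] = self.load_fn(key)
        self.counter[key] += 1
        return self.cache[key]

cache = TinyModelCache(load_fn=lambda kind: f"<{kind} weights>", capacity=2)
for kind in ["base", "inpaint", "style", "base"]:
    print(kind, "->", cache.get(kind))  # "style" evicts "inpaint"; "base" stays cached
```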
111
+
112
+ def load_clip_model():
113
+ from transformers import CLIPProcessor, CLIPModel
114
+ version = "openai/clip-vit-large-patch14"
115
+ model = CLIPModel.from_pretrained(version).cuda()
116
+ processor = CLIPProcessor.from_pretrained(version)
117
+
118
+ return {
119
+ 'version': version,
120
+ 'model': model,
121
+ 'processor': processor,
122
+ }
123
+
124
+ clip_model = load_clip_model()
125
+
126
+
127
+ class ImageMask(gr.components.Image):
128
+ """
129
+ Sets: source="canvas", tool="sketch"
130
+ """
131
+
132
+ is_template = True
133
+
134
+ def __init__(self, **kwargs):
135
+ super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)
136
+
137
+ def preprocess(self, x):
138
+ if x is None:
139
+ return x
140
+ if self.tool == "sketch" and self.source in ["upload", "webcam"] and type(x) != dict:
141
+ decode_image = processing_utils.decode_base64_to_image(x)
142
+ width, height = decode_image.size
143
+ mask = np.zeros((height, width, 4), dtype=np.uint8)
144
+ mask[..., -1] = 255
145
+ mask = self.postprocess(mask)
146
+ x = {'image': x, 'mask': mask}
147
+ return super().preprocess(x)
148
+
149
+
150
  class Blocks(gr.Blocks):
151
 
152
  def __init__(
153
+ self,
154
+ theme: str = "default",
155
+ analytics_enabled: Optional[bool] = None,
156
+ mode: str = "blocks",
157
+ title: str = "Gradio",
158
+ css: Optional[str] = None,
159
+ **kwargs,
160
  ):
161
+
162
  self.extra_configs = {
163
  'thumbnail': kwargs.pop('thumbnail', ''),
164
  'url': kwargs.pop('url', 'https://gradio.app/'),
 
173
 
174
  for k, v in self.extra_configs.items():
175
  config[k] = v
176
+
177
  return config
178
+
179
+ '''
180
+ inference model
181
+ '''
182
+
183
+ @torch.no_grad()
184
+ def inference(task, language_instruction, grounding_instruction, inpainting_boxes_nodrop, image,
185
+ alpha_sample, guidance_scale, batch_size,
186
+ fix_seed, rand_seed, actual_mask, style_image,
187
+ *args, **kwargs):
188
+ grounding_instruction = json.loads(grounding_instruction)
189
+ phrase_list, location_list = [], []
190
+ for k, v in grounding_instruction.items():
191
+ phrase_list.append(k)
192
+ location_list.append(v)
193
+
194
+ placeholder_image = Image.open('images/teddy.jpg').convert("RGB")
195
+ image_list = [placeholder_image] * len(phrase_list) # placeholder input for visual prompt, which is disabled
196
+
197
+ batch_size = int(batch_size)
198
+ if not 1 <= batch_size <= 4:
199
+ batch_size = 2
200
+
201
+ if style_image == None:
202
+ has_text_mask = 1
203
+ has_image_mask = 0 # then we hack above 'image_list'
204
+ else:
205
+ valid_phrase_len = len(phrase_list)
206
+
207
+ phrase_list += ['placeholder']
208
+ has_text_mask = [1]*valid_phrase_len + [0]
209
+
210
+ image_list = [placeholder_image]*valid_phrase_len + [style_image]
211
+ has_image_mask = [0]*valid_phrase_len + [1]
212
+
213
+ location_list += [ [0.0, 0.0, 1, 0.01] ] # style image grounding location
214
+
215
+ if task == 'Grounded Inpainting':
216
+ alpha_sample = 1.0
217
+
218
+ instruction = dict(
219
+ prompt = language_instruction,
220
+ phrases = phrase_list,
221
+ images = image_list,
222
+ locations = location_list,
223
+ alpha_type = [alpha_sample, 0, 1.0 - alpha_sample],
224
+ has_text_mask = has_text_mask,
225
+ has_image_mask = has_image_mask,
226
+ save_folder_name = language_instruction,
227
+ guidance_scale = guidance_scale,
228
+ batch_size = batch_size,
229
+ fix_seed = bool(fix_seed),
230
+ rand_seed = int(rand_seed),
231
+ actual_mask = actual_mask,
232
+ inpainting_boxes_nodrop = inpainting_boxes_nodrop,
233
+ )
234
+
235
+ get_model = partial(instance.get_model,
236
+ batch_size=batch_size,
237
+ instruction=language_instruction,
238
+ phrase_list=phrase_list)
239
+
240
+ with torch.autocast(device_type='cuda', dtype=torch.float16):
241
+ if task == 'Grounded Generation':
242
+ if style_image == None:
243
+ return grounded_generation_box(get_model('base'), instruction, *args, **kwargs)
244
+ else:
245
+ return grounded_generation_box(get_model('style'), instruction, *args, **kwargs)
246
+ elif task == 'Grounded Inpainting':
247
+ assert image is not None
248
+ instruction['input_image'] = image.convert("RGB")
249
+ return grounded_generation_box(get_model('inpaint'), instruction, *args, **kwargs)
250
+
251
+
252
  def draw_box(boxes=[], texts=[], img=None):
253
  if len(boxes) == 0 and img is None:
254
  return None
 
258
  colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
259
  draw = ImageDraw.Draw(img)
260
  font = ImageFont.truetype("DejaVuSansMono.ttf", size=18)
 
261
  for bid, box in enumerate(boxes):
262
  draw.rectangle([box[0], box[1], box[2], box[3]], outline=colors[bid % len(colors)], width=4)
263
  anno_text = texts[bid]
264
+ draw.rectangle([box[0], box[3] - int(font.size * 1.2), box[0] + int((len(anno_text) + 0.8) * font.size * 0.6), box[3]], outline=colors[bid % len(colors)], fill=colors[bid % len(colors)], width=4)
265
+ draw.text([box[0] + int(font.size * 0.2), box[3] - int(font.size*1.2)], anno_text, font=font, fill=(255,255,255))
 
 
 
266
  return img
267
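A minimal usage sketch for draw_box() above, assuming the DejaVuSansMono.ttf font it references is available on the host, as it is on the Space:

```python
from PIL import Image

# One pixel-space box and its label drawn on a blank 512x512 canvas
canvas = Image.new("RGB", (512, 512), "white")
preview = draw_box(boxes=[[64, 64, 256, 256]], texts=["a dog"], img=canvas)
preview.save("parsed_sketch_preview.png")
```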
 
268
  def get_concat(ims):
269
  if len(ims) == 1:
270
  n_col = 1
 
279
  return dst
280
 
281
 
282
+ def auto_append_grounding(language_instruction, grounding_texts):
283
+ for grounding_text in grounding_texts:
284
+ if grounding_text not in language_instruction and grounding_text != 'auto':
285
+ language_instruction += "; " + grounding_text
286
+ return language_instruction
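A quick usage example for auto_append_grounding() above: phrases already present in the caption are left alone, and missing ones are appended after a semicolon.

```python
# Assumes auto_append_grounding() as defined above
print(auto_append_grounding("a dog and an apple", ["a dog", "a birthday cake"]))
# -> "a dog and an apple; a birthday cake"
```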
 
 
 
287
 
 
288
289
290
 
291
+ def generate(task, language_instruction, grounding_texts, sketch_pad,
292
+ alpha_sample, guidance_scale, batch_size,
293
+ fix_seed, rand_seed, use_actual_mask, append_grounding, style_cond_image,
294
  state):
295
  if 'boxes' not in state:
296
  state['boxes'] = []
297
+
298
  boxes = state['boxes']
299
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
300
  # assert len(boxes) == len(grounding_texts)
 
306
  grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
307
 
308
  boxes = (np.asarray(boxes) / 512).tolist()
309
+ grounding_instruction = json.dumps({obj: box for obj,box in zip(grounding_texts, boxes)})
310
+
311
+ image = None
312
+ actual_mask = None
313
+ if task == 'Grounded Inpainting':
314
+ image = state.get('original_image', sketch_pad['image']).copy()
315
+ image = center_crop(image)
316
+ image = Image.fromarray(image)
317
+
318
+ if use_actual_mask:
319
+ actual_mask = sketch_pad['mask'].copy()
320
+ if actual_mask.ndim == 3:
321
+ actual_mask = actual_mask[..., 0]
322
+ actual_mask = center_crop(actual_mask, tgt_size=(64, 64))
323
+ actual_mask = torch.from_numpy(actual_mask == 0).float()
324
+
325
+ if state.get('inpaint_hw', None):
326
+ boxes = np.asarray(boxes) * 0.9 + 0.05
327
+ boxes = boxes.tolist()
328
+ grounding_instruction = json.dumps({obj: box for obj,box in zip(grounding_texts, boxes) if obj != 'auto'})
329
+
330
+ if append_grounding:
331
+ language_instruction = auto_append_grounding(language_instruction, grounding_texts)
332
+
333
+ gen_images, gen_overlays = inference(
334
+ task, language_instruction, grounding_instruction, boxes, image,
335
+ alpha_sample, guidance_scale, batch_size,
336
+ fix_seed, rand_seed, actual_mask, style_cond_image, clip_model=clip_model,
337
+ )
338
+
339
+ for idx, gen_image in enumerate(gen_images):
340
+
341
+ if task == 'Grounded Inpainting' and state.get('inpaint_hw', None):
342
+ hw = min(*state['original_image'].shape[:2])
343
+ gen_image = sized_center_fill(state['original_image'].copy(), np.array(gen_image.resize((hw, hw))), hw, hw)
344
+ gen_image = Image.fromarray(gen_image)
345
+
346
+ gen_images[idx] = gen_image
347
 
348
  blank_samples = batch_size % 2 if batch_size > 1 else 0
349
+ gen_images = [gr.Image.update(value=x, visible=True) for i,x in enumerate(gen_images)] \
350
+ + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
351
+ + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
352
 
353
  return gen_images + [state]
354
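generate() above serializes the drawn boxes as a phrase-to-box JSON mapping, and inference() parses it back into phrase and location lists. A small round-trip example with hypothetical pixel boxes on the 512x512 sketch pad:

```python
import json
import numpy as np

# Hypothetical boxes in pixels, as drawn on the 512x512 sketch pad
state_boxes = [[64, 64, 256, 256], [300, 120, 480, 400]]
grounding_texts = ["a dog", "an apple"]

# generate() normalizes the pixel boxes to [0, 1] and serializes one box per phrase
boxes = (np.asarray(state_boxes) / 512).tolist()
grounding_instruction = json.dumps(dict(zip(grounding_texts, boxes)))
print(grounding_instruction)

# inference() recovers the phrase and location lists from that JSON string
parsed = json.loads(grounding_instruction)
phrase_list, location_list = list(parsed.keys()), list(parsed.values())
print(phrase_list, location_list)
```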
 
 
356
  def binarize(x):
357
  return (x != 0).astype('uint8') * 255
358
 
 
359
  def sized_center_crop(img, cropx, cropy):
360
  y, x = img.shape[:2]
361
  startx = x // 2 - (cropx // 2)
362
+ starty = y // 2 - (cropy // 2)
363
+ return img[starty:starty+cropy, startx:startx+cropx]
 
364
 
365
  def sized_center_fill(img, fill, cropx, cropy):
366
  y, x = img.shape[:2]
367
  startx = x // 2 - (cropx // 2)
368
+ starty = y // 2 - (cropy // 2)
369
+ img[starty:starty+cropy, startx:startx+cropx] = fill
370
  return img
371
 
 
372
  def sized_center_mask(img, cropx, cropy):
373
  y, x = img.shape[:2]
374
  startx = x // 2 - (cropx // 2)
375
+ starty = y // 2 - (cropy // 2)
376
+ center_region = img[starty:starty+cropy, startx:startx+cropx].copy()
377
  img = (img * 0.2).astype('uint8')
378
+ img[starty:starty+cropy, startx:startx+cropx] = center_region
379
  return img
380
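sized_center_fill() and sized_center_mask() above paste or keep a centered cropx-by-cropy window; a toy check of the fill on a small array:

```python
import numpy as np

# Assumes sized_center_fill() as defined above: an 8x10 zero image gets a
# 4x4 block of 255s pasted at its center (rows 2:6, columns 3:7)
img = np.zeros((8, 10), dtype="uint8")
filled = sized_center_fill(img.copy(), np.full((4, 4), 255, dtype="uint8"), 4, 4)
print(filled[2:6, 3:7])  # all 255
```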
 
 
381
  def center_crop(img, HW=None, tgt_size=(512, 512)):
382
  if HW is None:
383
  H, W = img.shape[:2]
 
387
  img = img.resize(tgt_size)
388
  return np.array(img)
389
 
390
+ def draw(task, input, grounding_texts, new_image_trigger, state):
 
391
  if type(input) == dict:
392
  image = input['image']
393
  mask = input['mask']
394
  else:
395
  mask = input
396
+
397
  if mask.ndim == 3:
398
+ mask = mask[..., 0]
399
 
400
  image_scale = 1.0
401
 
402
+ # resize trigger
403
+ if task == "Grounded Inpainting":
404
+ mask_cond = mask.sum() == 0
405
+ # size_cond = mask.shape != (512, 512)
406
+ if mask_cond and 'original_image' not in state:
407
+ image = Image.fromarray(image)
408
+ width, height = image.size
409
+ scale = 600 / min(width, height)
410
+ image = image.resize((int(width * scale), int(height * scale)))
411
+ state['original_image'] = np.array(image).copy()
412
+ image_scale = float(height / width)
413
+ return [None, new_image_trigger + 1, image_scale, state]
414
+ else:
415
+ original_image = state['original_image']
416
+ H, W = original_image.shape[:2]
417
+ image_scale = float(H / W)
418
+
419
+ mask = binarize(mask)
420
+ if mask.shape != (512, 512):
421
+ # assert False, "should not receive any non- 512x512 masks."
422
+ if 'original_image' in state and state['original_image'].shape[:2] == mask.shape:
423
+ mask = center_crop(mask, state['inpaint_hw'])
424
+ image = center_crop(state['original_image'], state['inpaint_hw'])
425
+ else:
426
+ mask = np.zeros((512, 512), dtype=np.uint8)
427
+ # mask = center_crop(mask)
428
  mask = binarize(mask)
429
 
430
  if type(mask) != np.ndarray:
431
  mask = np.array(mask)
432
 
433
+ if mask.sum() == 0 and task != "Grounded Inpainting":
434
  state = {}
435
 
436
+ if task != 'Grounded Inpainting':
437
+ image = None
438
+ else:
439
+ image = Image.fromarray(image)
440
 
441
  if 'boxes' not in state:
442
  state['boxes'] = []
 
465
  grounding_texts = [x.strip() for x in grounding_texts.split(';')]
466
  grounding_texts = [x for x in grounding_texts if len(x) > 0]
467
  if len(grounding_texts) < len(state['boxes']):
468
+ grounding_texts += [f'Obj. {bid+1}' for bid in range(len(grounding_texts), len(state['boxes']))]
469
+
470
  box_image = draw_box(state['boxes'], grounding_texts, image)
471
 
472
+ if box_image is not None and state.get('inpaint_hw', None):
473
+ inpaint_hw = state['inpaint_hw']
474
+ box_image_resize = np.array(box_image.resize((inpaint_hw, inpaint_hw)))
475
+ original_image = state['original_image'].copy()
476
+ box_image = sized_center_fill(original_image, box_image_resize, inpaint_hw, inpaint_hw)
477
 
478
+ return [box_image, new_image_trigger, image_scale, state]
479
 
480
  def clear(task, sketch_pad_trigger, batch_size, state, switch_task=False):
481
  if task != 'Grounded Inpainting':
482
  sketch_pad_trigger = sketch_pad_trigger + 1
483
  blank_samples = batch_size % 2 if batch_size > 1 else 0
484
+ out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)] \
485
+ + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
486
+ + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
487
+ state = {}
488
+ return [None, sketch_pad_trigger, None, 1.0] + out_images + [state]
489
+
490
+ css = """
491
+ #img2img_image, #img2img_image > .fixed-height, #img2img_image > .fixed-height > div, #img2img_image > .fixed-height > div > img
492
+ {
493
+ height: var(--height) !important;
494
+ max-height: var(--height) !important;
495
+ min-height: var(--height) !important;
496
+ }
497
+ #paper-info a {
498
+ color:#008AD7;
499
+ text-decoration: none;
500
+ }
501
+ #paper-info a:hover {
502
+ cursor: pointer;
503
+ text-decoration: none;
504
+ }
505
+ """
506
+
507
+ rescale_js = """
508
+ function(x) {
509
+ const root = document.querySelector('gradio-app').shadowRoot || document.querySelector('gradio-app');
510
+ let image_scale = parseFloat(root.querySelector('#image_scale input').value) || 1.0;
511
+ const image_width = root.querySelector('#img2img_image').clientWidth;
512
+ const target_height = parseInt(image_width * image_scale);
513
+ document.body.style.setProperty('--height', `${target_height}px`);
514
+ root.querySelectorAll('button.justify-center.rounded')[0].style.display='none';
515
+ root.querySelectorAll('button.justify-center.rounded')[1].style.display='none';
516
+ return x;
517
+ }
518
+ """
519
+
520
+ with Blocks(
521
+ css=css,
522
+ analytics_enabled=False,
523
+ title="GLIGen demo",
524
+ ) as main:
525
+ description = """<p style="text-align: center; font-weight: bold;">
526
+ <span style="font-size: 28px">GLIGen: Open-Set Grounded Text-to-Image Generation</span>
527
+ <br>
528
+ <span style="font-size: 18px" id="paper-info">
529
+ [<a href="https://gligen.github.io" target="_blank">Project Page</a>]
530
+ [<a href="https://arxiv.org/abs/2301.07093" target="_blank">Paper</a>]
531
+ [<a href="https://github.com/gligen/GLIGEN" target="_blank">GitHub</a>]
532
+ </span>
533
+ </p>
534
+ <p>
535
+ To ground concepts of interest with desired spatial specification, please (1) &#9000;&#65039; enter the concept names in <em> Grounding Instruction</em>, and (2) &#128433;&#65039; draw their corresponding bounding boxes one by one using <em> Sketch Pad</em> -- the parsed boxes will be displayed automatically.
536
+ <br>
537
+ For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/gligen/demo?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a>
538
+ </p>
539
  """
540
+ gr.HTML(description)
541
 
542
+ with gr.Row():
543
+ with gr.Column(scale=4):
544
  sketch_pad_trigger = gr.Number(value=0, visible=False)
545
  sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
546
  init_white_trigger = gr.Number(value=0, visible=False)
547
  image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
548
  new_image_trigger = gr.Number(value=0, visible=False)
549
 
550
+ task = gr.Radio(
551
+ choices=["Grounded Generation", 'Grounded Inpainting'],
552
+ type="value",
553
+ value="Grounded Generation",
554
+ label="Task",
555
+ )
556
+ language_instruction = gr.Textbox(
557
+ label="Language instruction",
558
+ )
559
+ grounding_instruction = gr.Textbox(
560
+ label="Grounding instruction (Separated by semicolon)",
561
+ )
562
  with gr.Row():
563
+ sketch_pad = ImageMask(label="Sketch Pad", elem_id="img2img_image")
564
  out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
 
 
 
565
  with gr.Row():
566
  clear_btn = gr.Button(value='Clear')
567
  gen_btn = gr.Button(value='Generate')
 
568
  with gr.Accordion("Advanced Options", open=False):
569
  with gr.Column():
570
+ alpha_sample = gr.Slider(minimum=0, maximum=1.0, step=0.1, value=0.3, label="Scheduled Sampling (τ)")
571
+ guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale")
572
+ batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=2, label="Number of Samples")
573
+ append_grounding = gr.Checkbox(value=True, label="Append grounding instructions to the caption")
574
+ use_actual_mask = gr.Checkbox(value=False, label="Use actual mask for inpainting", visible=False)
575
+ with gr.Row():
576
+ fix_seed = gr.Checkbox(value=True, label="Fixed seed")
577
+ rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=0, label="Seed")
578
+ with gr.Row():
579
+ use_style_cond = gr.Checkbox(value=False, label="Enable Style Condition")
580
+ style_cond_image = gr.Image(type="pil", label="Style Condition", visible=False, interactive=True)
581
+ with gr.Column(scale=4):
582
+ gr.HTML('<span style="font-size: 20px; font-weight: bold">Generated Images</span>')
583
+ with gr.Row():
584
+ out_gen_1 = gr.Image(type="pil", visible=True, show_label=False)
585
+ out_gen_2 = gr.Image(type="pil", visible=True, show_label=False)
586
+ with gr.Row():
587
+ out_gen_3 = gr.Image(type="pil", visible=False, show_label=False)
588
+ out_gen_4 = gr.Image(type="pil", visible=False, show_label=False)
589
+
590
+ state = gr.State({})
591
+
592
+ class Controller:
593
+ def __init__(self):
594
+ self.calls = 0
595
+ self.tracks = 0
596
+ self.resizes = 0
597
+ self.scales = 0
598
+
599
+ def init_white(self, init_white_trigger):
600
+ self.calls += 1
601
+ return np.ones((512, 512), dtype='uint8') * 255, 1.0, init_white_trigger+1
602
+
603
+ def change_n_samples(self, n_samples):
604
+ blank_samples = n_samples % 2 if n_samples > 1 else 0
605
+ return [gr.Image.update(visible=True) for _ in range(n_samples + blank_samples)] \
606
+ + [gr.Image.update(visible=False) for _ in range(4 - n_samples - blank_samples)]
607
+
608
+ def resize_centercrop(self, state):
609
+ self.resizes += 1
610
+ image = state['original_image'].copy()
611
+ inpaint_hw = int(0.9 * min(*image.shape[:2]))
612
+ state['inpaint_hw'] = inpaint_hw
613
+ image_cc = center_crop(image, inpaint_hw)
614
+ # print(f'resize triggered {self.resizes}', image.shape, '->', image_cc.shape)
615
+ return image_cc, state
616
+
617
+ def resize_masked(self, state):
618
+ self.resizes += 1
619
+ image = state['original_image'].copy()
620
+ inpaint_hw = int(0.9 * min(*image.shape[:2]))
621
+ state['inpaint_hw'] = inpaint_hw
622
+ image_mask = sized_center_mask(image, inpaint_hw, inpaint_hw)
623
+ state['masked_image'] = image_mask.copy()
624
+ # print(f'mask triggered {self.resizes}')
625
+ return image_mask, state
626
+
627
+ def switch_task_hide_cond(self, task):
628
+ cond = False
629
+ if task == "Grounded Generation":
630
+ cond = True
631
+
632
+ return gr.Checkbox.update(visible=cond, value=False), gr.Image.update(value=None, visible=False), gr.Slider.update(visible=cond), gr.Checkbox.update(visible=(not cond), value=False)
633
+
634
+ controller = Controller()
635
+ main.load(
636
+ lambda x:x+1,
637
+ inputs=sketch_pad_trigger,
638
+ outputs=sketch_pad_trigger,
639
+ queue=False)
640
+ sketch_pad.edit(
641
+ draw,
642
+ inputs=[task, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
643
+ outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
644
+ queue=False,
645
+ )
646
+ grounding_instruction.change(
647
+ draw,
648
+ inputs=[task, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
649
+ outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
650
+ queue=False,
651
+ )
652
+ clear_btn.click(
653
+ clear,
654
+ inputs=[task, sketch_pad_trigger, batch_size, state],
655
+ outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, out_gen_2, out_gen_3, out_gen_4, state],
656
+ queue=False)
657
+ task.change(
658
+ partial(clear, switch_task=True),
659
+ inputs=[task, sketch_pad_trigger, batch_size, state],
660
+ outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, out_gen_2, out_gen_3, out_gen_4, state],
661
+ queue=False)
662
+ sketch_pad_trigger.change(
663
+ controller.init_white,
664
+ inputs=[init_white_trigger],
665
+ outputs=[sketch_pad, image_scale, init_white_trigger],
666
+ queue=False)
667
+ sketch_pad_resize_trigger.change(
668
+ controller.resize_masked,
669
+ inputs=[state],
670
+ outputs=[sketch_pad, state],
671
+ queue=False)
672
+ batch_size.change(
673
+ controller.change_n_samples,
674
+ inputs=[batch_size],
675
+ outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4],
676
+ queue=False)
677
+ gen_btn.click(
678
+ generate,
679
+ inputs=[
680
+ task, language_instruction, grounding_instruction, sketch_pad,
681
+ alpha_sample, guidance_scale, batch_size,
682
+ fix_seed, rand_seed,
683
+ use_actual_mask,
684
+ append_grounding, style_cond_image,
685
+ state,
686
+ ],
687
+ outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4, state],
688
+ queue=True
689
+ )
690
+ sketch_pad_resize_trigger.change(
691
+ None,
692
+ None,
693
+ sketch_pad_resize_trigger,
694
+ _js=rescale_js,
695
+ queue=False)
696
+ init_white_trigger.change(
697
+ None,
698
+ None,
699
+ init_white_trigger,
700
+ _js=rescale_js,
701
+ queue=False)
702
+ use_style_cond.change(
703
+ lambda cond: gr.Image.update(visible=cond),
704
+ use_style_cond,
705
+ style_cond_image,
706
+ queue=False)
707
+ task.change(
708
+ controller.switch_task_hide_cond,
709
+ inputs=task,
710
+ outputs=[use_style_cond, style_cond_image, alpha_sample, use_actual_mask],
711
+ queue=False)
712
+
713
+ with gr.Column():
714
+ gr.Examples(
715
+ examples=[
716
+ [
717
+ "images/blank.png",
718
+ "Grounded Generation",
719
+ "a dog and an apple",
720
+ "a dog;an apple",
721
  ],
722
+ [
723
+ "images/blank.png",
724
+ "Grounded Generation",
725
+ "John Lennon is using a pc",
726
+ "John Lennon;a pc",
727
+ [
728
+ "images/blank.png",
729
+ "Grounded Generation",
730
+ "a painting of a fox sitting in a field at sunrise in the style of Claude Mone",
731
+ "fox;sunrise",
732
  ],
733
+ ],
734
+ [
735
+ "images/blank.png",
736
+ "Grounded Generation",
737
+ "a beautiful painting of hot dog by studio ghibli, octane render, brilliantly coloured",
738
+ "hot dog",
739
+ ],
740
+ [
741
+ "images/blank.png",
742
+ "Grounded Generation",
743
+ "a sport car, unreal engine, global illumination, ray tracing",
744
+ "a sport car",
745
+ ],
746
+ [
747
+ "images/flower_beach.jpg",
748
+ "Grounded Inpainting",
749
+ "a squirrel and the space needle",
750
+ "a squirrel;the space needle",
751
+ ],
752
+ [
753
+ "images/arg_corgis.jpeg",
754
+ "Grounded Inpainting",
755
+ "a dog and a birthday cake",
756
+ "a dog; a birthday cake",
757
+ ],
758
+ [
759
+ "images/teddy.jpg",
760
+ "Grounded Inpainting",
761
+ "a teddy bear wearing a santa claus red shirt; holding a Christmas gift box on hand",
762
+ "a santa claus shirt; a Christmas gift box",
763
+ ],
764
+ ],
765
+ inputs=[sketch_pad, task, language_instruction, grounding_instruction],
766
+ outputs=None,
767
+ fn=None,
768
+ cache_examples=False,
769
+ )
770
+
771
+ main.queue(concurrency_count=1, api_open=False)
772
+ main.launch(share=False, show_api=False, show_error=True)
773
environment.yaml ADDED
@@ -0,0 +1,29 @@
1
+ name: loco_gligen_demo
2
+ channels:
3
+ - xformers/label/dev
4
+ - pytorch
5
+ - defaults
6
+ dependencies:
7
+ - python=3.10.8
8
+ - pip=22.2.2
9
+ - cudatoolkit=11.3
10
+ - pytorch=1.12.1
11
+ - torchvision=0.13.1
12
+ - numpy=1.23.1
13
+ - xformers
14
+ - pip:
15
+ - omegaconf==2.1.1
16
+ - albumentations==1.3.0
17
+ - opencv-python
18
+ - imageio==2.9.0
19
+ - imageio-ffmpeg==0.4.2
20
+ - pytorch-lightning==1.4.2
21
+ - test-tube>=0.7.5
22
+ - streamlit==1.12.1
23
+ - einops==0.3.0
24
+ - git+https://github.com/openai/CLIP.git
25
+ - protobuf~=3.20.1
26
+ - torchmetrics==0.6.0
27
+ - transformers==4.19.2
28
+ - kornia==0.6.0
29
+ - gradio==3.16.0
requirements.txt CHANGED
@@ -1,14 +1,18 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu113
2
- torch
3
- torchvision==0.14.0
4
- omegaconf==2.2.3
 
5
  opencv-python
6
  imageio==2.9.0
7
- transformers==4.24.0
8
- diffusers==0.7.2
9
- accelerate==0.13.2
10
- scipy==1.9.1
 
11
  git+https://github.com/openai/CLIP.git
12
- hydra-core==1.2.0
13
- tqdm
14
- gradio==3.23.0
 
 
 
1
+ torch==1.13.1
2
+ torchvision==0.14.1
3
+ xformers==0.0.16
4
+ omegaconf==2.1.1
5
+ albumentations==1.3.0
6
  opencv-python
7
  imageio==2.9.0
8
+ imageio-ffmpeg==0.4.2
9
+ pytorch-lightning==1.4.2
10
+ test-tube>=0.7.5
11
+ streamlit==1.17.0
12
+ einops==0.3.0
13
  git+https://github.com/openai/CLIP.git
14
+ protobuf~=3.20.1
15
+ torchmetrics==0.6.0
16
+ transformers==4.19.2
17
+ kornia==0.6.0
18
+ gradio==3.19.1
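A quick, hedged way to confirm which of these pins actually resolved in the running Space (importlib.metadata is in the standard library; the package names mirror the list above):

```python
from importlib.metadata import version

# Print the installed versions of a few of the pinned packages
for pkg in ["torch", "torchvision", "transformers", "gradio"]:
    print(pkg, version(pkg))
```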