Spaces:

Adapter
/

T2I-Adapter

Runtime error

App Files Files Community

LiangbinXie committed on Mar 6, 2023

Commit

0177fec

•

1 Parent(s): aa0bbd7

add composable adapter

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +1 -5
app.py +277 -63
{models → configs/mm}/faster_rcnn_r50_fpn_coco.py +182 -182
{models → configs/mm}/hrnet_w48_coco_256x192.py +169 -169
configs/stable-diffusion/sd-v1-inference.yaml +65 -0
configs/stable-diffusion/sd-v1-train.yaml +86 -0
configs/stable-diffusion/train_keypose.yaml +87 -0
configs/stable-diffusion/train_mask.yaml +87 -0
configs/stable-diffusion/train_sketch.yaml +87 -0
demo/demos.py +0 -309
demo/model.py +0 -979
dist_util.py +91 -0
docs/AdapterZoo.md +16 -0
docs/FAQ.md +5 -0
docs/examples.md +41 -0
environment.yaml +0 -31
ldm/modules/structure_condition/midas/__init__.py → experiments/README.md +0 -0
ldm/data/base.py +0 -23
ldm/data/dataset_coco.py +36 -0
ldm/data/dataset_depth.py +35 -0
ldm/data/dataset_laion.py +130 -0
ldm/data/dataset_wikiart.py +67 -0
ldm/data/imagenet.py +0 -394
ldm/data/lsun.py +0 -92
ldm/data/utils.py +40 -0
ldm/inference_base.py +282 -0
ldm/models/autoencoder.py +43 -275
ldm/models/diffusion/classifier.py +0 -267
ldm/models/diffusion/ddim.py +68 -17
ldm/models/diffusion/ddpm.py +251 -384
ldm/models/diffusion/dpm_solver/dpm_solver.py +152 -119
ldm/models/diffusion/dpm_solver/sampler.py +8 -3
ldm/models/diffusion/plms.py +23 -48
ldm/modules/attention.py +4 -0
ldm/modules/diffusionmodules/openaimodel.py +85 -263
ldm/modules/diffusionmodules/util.py +5 -2
ldm/modules/ema.py +12 -8
ldm/modules/encoders/adapter.py +84 -76
ldm/modules/encoders/modules.py +349 -142
ldm/modules/{structure_condition → extra_condition}/__init__.py +0 -0
ldm/modules/extra_condition/api.py +269 -0
ldm/modules/{structure_condition/midas → extra_condition}/midas/__init__.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/api.py +4 -4
ldm/modules/{structure_condition/openpose → extra_condition/midas/midas}/__init__.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/base_model.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/blocks.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/dpt_depth.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/midas_net.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/midas_net_custom.py +0 -0
ldm/modules/{structure_condition → extra_condition}/midas/midas/transforms.py +0 -0

.gitignore CHANGED Viewed

@@ -1,6 +1,3 @@
-# ignored folders
-models
 # ignored folders
 tmp/*
@@ -23,7 +20,6 @@ version.py
 # Byte-compiled / optimized / DLL files
 __pycache__/
-*.pyc
 *.py[cod]
 *$py.class
@@ -125,4 +121,4 @@ venv.bak/
 /site
 # mypy
-.mypy_cache/

 # ignored folders
 tmp/*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 /site
 # mypy
+.mypy_cache/

app.py CHANGED Viewed

@@ -1,29 +1,44 @@
 import os
-# os.system('pip3 install openmim')
-os.system('mim install mmcv-full==1.7.0')
-# os.system('pip3 install mmpose')
-# os.system('pip3 install mmdet')
-# os.system('pip3 install gradio==3.19.1')
-#os.system('pip3 install psutil')
-from demo.model import Model_all
 import gradio as gr
-from demo.demos import create_demo_keypose, create_demo_sketch, create_demo_draw, create_demo_seg, create_demo_depth, create_demo_depth_keypose, create_demo_color, create_demo_color_sketch, create_demo_openpose, create_demo_style_sketch, create_demo_canny
 import torch
-import subprocess
-import shlex
 from huggingface_hub import hf_hub_url
 urls = {
-    'TencentARC/T2I-Adapter':['models/t2iadapter_keypose_sd14v1.pth', 'models/t2iadapter_color_sd14v1.pth', 'models/t2iadapter_openpose_sd14v1.pth', 'models/t2iadapter_seg_sd14v1.pth', 'models/t2iadapter_sketch_sd14v1.pth', 'models/t2iadapter_depth_sd14v1.pth','third-party-models/body_pose_model.pth', "models/t2iadapter_style_sd14v1.pth", "models/t2iadapter_canny_sd14v1.pth"],
-    'CompVis/stable-diffusion-v-1-4-original':['sd-v1-4.ckpt'],
-    'andite/anything-v4.0':['anything-v4.0-pruned.ckpt', 'anything-v4.0.vae.pt'],
 }
-urls_mmpose = [
-    'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth',
-    'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth',
-    'https://github.com/kazuto1011/deeplab-pytorch/releases/download/v1.0/deeplabv2_resnet101_msc-cocostuff164k-100000.pth'
-]
 if os.path.exists('models') == False:
     os.mkdir('models')
 for repo in urls:
@@ -31,58 +46,257 @@ for repo in urls:
     for file in files:
         url = hf_hub_url(repo, file)
         name_ckp = url.split('/')[-1]
-        save_path = os.path.join('models',name_ckp)
         if os.path.exists(save_path) == False:
             subprocess.run(shlex.split(f'wget {url} -O {save_path}'))
-for url in urls_mmpose:
-    name_ckp = url.split('/')[-1]
-    save_path = os.path.join('models',name_ckp)
-    if os.path.exists(save_path) == False:
-        subprocess.run(shlex.split(f'wget {url} -O {save_path}'))
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-model = Model_all(device)
-DESCRIPTION = '''# T2I-Adapter
-Gradio demo for **T2I-Adapter**: [[GitHub]](https://github.com/TencentARC/T2I-Adapter), [[Paper]](https://arxiv.org/abs/2302.08453).
-It also supports **multiple adapters** in the follwing tabs showing **"A adapter + B adapter"**.
-If T2I-Adapter is helpful, please help to ⭐ the [Github Repo](https://github.com/TencentARC/T2I-Adapter) and recommend it to your friends 😊
-'''
 with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
-    gr.HTML("""<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
-    <br/>
-    <a href="https://huggingface.co/spaces/Adapter/T2I-Adapter?duplicate=true">
-    <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-    <p/>""")
-    with gr.Tabs():
-        with gr.TabItem('Openpose'):
-            create_demo_openpose(model.process_openpose)
-        with gr.TabItem('Keypose'):
-            create_demo_keypose(model.process_keypose)
-        with gr.TabItem('Canny'):
-            create_demo_canny(model.process_canny)
-        with gr.TabItem('Sketch'):
-            create_demo_sketch(model.process_sketch)
-        with gr.TabItem('Draw'):
-            create_demo_draw(model.process_draw)
-        with gr.TabItem('Depth'):
-            create_demo_depth(model.process_depth)
-        with gr.TabItem('Depth + Keypose'):
-            create_demo_depth_keypose(model.process_depth_keypose)
-        with gr.TabItem('Color'):
-            create_demo_color(model.process_color)
-        with gr.TabItem('Color + Sketch'):
-            create_demo_color_sketch(model.process_color_sketch)
-        with gr.TabItem('Style + Sketch'):
-            create_demo_style_sketch(model.process_style_sketch)
-        with gr.TabItem('Segmentation'):
-            create_demo_seg(model.process_seg)
-demo.queue().launch(debug=True, server_name='0.0.0.0')

+# demo inspired by https://huggingface.co/spaces/lambdalabs/image-mixer-demo
+import argparse
+import copy
 import os
+import shlex
+import subprocess
+from functools import partial
+from itertools import chain
+import cv2
 import gradio as gr
 import torch
+from basicsr.utils import tensor2img
 from huggingface_hub import hf_hub_url
+from pytorch_lightning import seed_everything
+from torch import autocast
+from ldm.inference_base import (DEFAULT_NEGATIVE_PROMPT, diffusion_inference,
+                                get_adapters, get_sd_models)
+from ldm.modules.extra_condition import api
+from ldm.modules.extra_condition.api import (ExtraCondition,
+                                             get_adapter_feature,
+                                             get_cond_model)
+torch.set_grad_enabled(False)
+supported_cond = ['style', 'color', 'canny', 'sketch', 'openpose', 'depth']
+# download the checkpoints
 urls = {
+    'TencentARC/T2I-Adapter': [
+        'models/t2iadapter_keypose_sd14v1.pth', 'models/t2iadapter_color_sd14v1.pth',
+        'models/t2iadapter_openpose_sd14v1.pth', 'models/t2iadapter_seg_sd14v1.pth',
+        'models/t2iadapter_sketch_sd14v1.pth', 'models/t2iadapter_depth_sd14v1.pth',
+        'third-party-models/body_pose_model.pth', "models/t2iadapter_style_sd14v1.pth",
+        "models/t2iadapter_canny_sd14v1.pth", "third-party-models/table5_pidinet.pth"
+    ],
+    'runwayml/stable-diffusion-v1-5': ['v1-5-pruned-emaonly.ckpt'],
+    'andite/anything-v4.0': ['anything-v4.0-pruned.ckpt', 'anything-v4.0.vae.pt'],
 }
 if os.path.exists('models') == False:
     os.mkdir('models')
 for repo in urls:
     for file in files:
         url = hf_hub_url(repo, file)
         name_ckp = url.split('/')[-1]
+        save_path = os.path.join('models', name_ckp)
         if os.path.exists(save_path) == False:
             subprocess.run(shlex.split(f'wget {url} -O {save_path}'))
+# config
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--sd_ckpt',
+    type=str,
+    default='models/v1-5-pruned-emaonly.ckpt',
+    help='path to checkpoint of stable diffusion model, both .ckpt and .safetensor are supported',
+)
+parser.add_argument(
+    '--vae_ckpt',
+    type=str,
+    default=None,
+    help='vae checkpoint, anime SD models usually have seperate vae ckpt that need to be loaded',
+)
+global_opt = parser.parse_args()
+global_opt.config = 'configs/stable-diffusion/sd-v1-inference.yaml'
+for cond_name in supported_cond:
+    setattr(global_opt, f'{cond_name}_adapter_ckpt', f'models/t2iadapter_{cond_name}_sd14v1.pth')
+global_opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+global_opt.max_resolution = 512 * 512
+global_opt.sampler = 'ddim'
+global_opt.cond_weight = 1.0
+global_opt.C = 4
+global_opt.f = 8
+# stable-diffusion model
+sd_model, sampler = get_sd_models(global_opt)
+# adapters and models for processing condition inputs
+adapters = {}
+cond_models = {}
+torch.cuda.empty_cache()
+def run(*args):
+    # Gradio click handler: builds the condition inputs, runs diffusion, and
+    # returns (generated images, visualised condition maps).
+    #
+    # `args` is the flat input list wired up by `submit.click`: first the radio
+    # values, then the "Image" uploads, then the condition uploads, then the
+    # condition weights (each group has len(supported_cond) entries), followed
+    # by 8 trailing values: prompt, neg_prompt, scale, n_samples, seed, steps,
+    # resize_short_edge, cond_tau.
+    with torch.inference_mode(), \
+            sd_model.ema_scope(), \
+            autocast('cuda'):
+        # split the leading arguments back into groups of len(supported_cond)
+        inps = []
+        for i in range(0, len(args) - 8, len(supported_cond)):
+            inps.append(args[i:i + len(supported_cond)])
+        # work on a per-request copy so global_opt is never mutated
+        opt = copy.deepcopy(global_opt)
+        opt.prompt, opt.neg_prompt, opt.scale, opt.n_samples, opt.seed, opt.steps, opt.resize_short_edge, opt.cond_tau \
+            = args[-8:]
+        conds = []
+        activated_conds = []
+        ims1 = []
+        ims2 = []
+        # take the reference size (h, w) from the first condition with idx > 1
+        # that has any image uploaded
+        # NOTE(review): h and w stay unbound when no condition with idx > 1 has
+        # an image; the resize loop below still uses them for idx == 1, so an
+        # upload on that slot alone raises UnboundLocalError.
+        for idx, (b, im1, im2, cond_weight) in enumerate(zip(*inps)):
+            if idx > 1:
+                if im1 is not None or im2 is not None:
+                    if im1 is not None:
+                        h, w, _ = im1.shape
+                    else:
+                        h, w, _ = im2.shape
+                    break
+        # resize all the images to the same size
+        for idx, (b, im1, im2, cond_weight) in enumerate(zip(*inps)):
+            # the images at idx 0 are kept at their original size
+            if idx == 0:
+                ims1.append(im1)
+                ims2.append(im2)
+                continue
+            if im1 is not None:
+                im1 = cv2.resize(im1, (w, h), interpolation=cv2.INTER_CUBIC)
+            if im2 is not None:
+                im2 = cv2.resize(im2, (w, h), interpolation=cv2.INTER_CUBIC)
+            ims1.append(im1)
+            ims2.append(im2)
+        # load / move adapters and compute one condition tensor per active slot
+        for idx, (b, _, _, cond_weight) in enumerate(zip(*inps)):
+            cond_name = supported_cond[idx]
+            if b == 'Nothing':
+                # inactive condition: move an already-loaded adapter to the CPU
+                if cond_name in adapters:
+                    adapters[cond_name]['model'] = adapters[cond_name]['model'].cpu()
+            else:
+                activated_conds.append(cond_name)
+                # adapters are cached in the module-level dict and loaded on first use
+                if cond_name in adapters:
+                    adapters[cond_name]['model'] = adapters[cond_name]['model'].to(opt.device)
+                else:
+                    adapters[cond_name] = get_adapters(opt, getattr(ExtraCondition, cond_name))
+                adapters[cond_name]['cond_weight'] = cond_weight
+                process_cond_module = getattr(api, f'get_cond_{cond_name}')
+                if b == 'Image':
+                    # "Image" input: the condition model is created on first use
+                    # and passed to the processing function together with the upload
+                    if cond_name not in cond_models:
+                        cond_models[cond_name] = get_cond_model(opt, getattr(ExtraCondition, cond_name))
+                    conds.append(process_cond_module(opt, ims1[idx], 'image', cond_models[cond_name]))
+                else:
+                    # any other radio value: the second upload is passed with
+                    # cond_name as the input type and no condition model
+                    conds.append(process_cond_module(opt, ims2[idx], cond_name, None))
+        adapter_features, append_to_context = get_adapter_feature(
+            conds, [adapters[cond_name] for cond_name in activated_conds])
+        # convert the condition tensors to images for the second gallery
+        output_conds = []
+        for cond in conds:
+            output_conds.append(tensor2img(cond, rgb2bgr=False))
+        ims = []
+        # seeded once, before the sampling loop
+        seed_everything(opt.seed)
+        for _ in range(opt.n_samples):
+            result = diffusion_inference(opt, sd_model, sampler, adapter_features, append_to_context)
+            ims.append(tensor2img(result, rgb2bgr=False))
+        # Clear GPU memory cache so less likely to OOM
+        torch.cuda.empty_cache()
+        return ims, output_conds
+def change_visible(im1, im2, val):
+    # Radio change handler: returns a dict mapping the two image components to
+    # gr.update(...) visibility changes for the selected value `val`.
+    #   "Image"   -> only im1 is shown
+    #   "Nothing" -> both are hidden
+    #   otherwise -> only im2 is shown
+    # im1 and im2 are pre-bound with functools.partial where the handler is
+    # registered; only `val` comes from the radio component.
+    outputs = {}
+    if val == "Image":
+        outputs[im1] = gr.update(visible=True)
+        outputs[im2] = gr.update(visible=False)
+    elif val == "Nothing":
+        outputs[im1] = gr.update(visible=False)
+        outputs[im2] = gr.update(visible=False)
+    else:
+        outputs[im1] = gr.update(visible=False)
+        outputs[im2] = gr.update(visible=True)
+    return outputs
+DESCRIPTION = '# [Composable T2I-Adapter](https://github.com/TencentARC/T2I-Adapter)'
+DESCRIPTION += f'<p>Gradio demo for **T2I-Adapter**: [[GitHub]](https://github.com/TencentARC/T2I-Adapter), [[Paper]](https://arxiv.org/abs/2302.08453). If T2I-Adapter is helpful, please help to ⭐ the [Github Repo](https://github.com/TencentARC/T2I-Adapter) and recommend it to your friends 😊 </p>'
+DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/Adapter/T2I-Adapter?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
+    btns = []
+    ims1 = []
+    ims2 = []
+    cond_weights = []
+    with gr.Row():
+        with gr.Column(scale=1.9):
+            with gr.Box():
+                gr.Markdown("<h5><center>Style & Color</center></h5>")
+                with gr.Row():
+                    for cond_name in supported_cond[:2]:
+                        with gr.Box():
+                            with gr.Column():
+                                if cond_name == 'style':
+                                    btn1 = gr.Radio(
+                                        choices=["Image", "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                else:
+                                    btn1 = gr.Radio(
+                                        choices=["Image", cond_name, "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                im1 = gr.Image(
+                                    source='upload', label="Image", interactive=True, visible=False, type="numpy")
+                                im2 = gr.Image(
+                                    source='upload', label=cond_name, interactive=True, visible=False, type="numpy")
+                                cond_weight = gr.Slider(
+                                    label="Condition weight",
+                                    minimum=0,
+                                    maximum=5,
+                                    step=0.05,
+                                    value=1,
+                                    interactive=True)
+                                fn = partial(change_visible, im1, im2)
+                                btn1.change(fn=fn, inputs=[btn1], outputs=[im1, im2], queue=False)
+                                btns.append(btn1)
+                                ims1.append(im1)
+                                ims2.append(im2)
+                                cond_weights.append(cond_weight)
+        with gr.Column(scale=4):
+            with gr.Box():
+                gr.Markdown("<h5><center>Structure</center></h5>")
+                with gr.Row():
+                    for cond_name in supported_cond[2:6]:
+                        with gr.Box():
+                            with gr.Column():
+                                if cond_name == 'openpose':
+                                    btn1 = gr.Radio(
+                                        choices=["Image", 'pose', "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                else:
+                                    btn1 = gr.Radio(
+                                        choices=["Image", cond_name, "Nothing"],
+                                        label=f"Input type for {cond_name}",
+                                        interactive=True,
+                                        value="Nothing",
+                                    )
+                                im1 = gr.Image(
+                                    source='upload', label="Image", interactive=True, visible=False, type="numpy")
+                                im2 = gr.Image(
+                                    source='upload', label=cond_name, interactive=True, visible=False, type="numpy")
+                                cond_weight = gr.Slider(
+                                    label="Condition weight",
+                                    minimum=0,
+                                    maximum=5,
+                                    step=0.05,
+                                    value=1,
+                                    interactive=True)
+                                fn = partial(change_visible, im1, im2)
+                                btn1.change(fn=fn, inputs=[btn1], outputs=[im1, im2], queue=False)
+                                btns.append(btn1)
+                                ims1.append(im1)
+                                ims2.append(im2)
+                                cond_weights.append(cond_weight)
+    with gr.Column():
+        prompt = gr.Textbox(label="Prompt")
+        with gr.Accordion('Advanced options', open=False):
+            neg_prompt = gr.Textbox(label="Negative Prompt", value=DEFAULT_NEGATIVE_PROMPT)
+            scale = gr.Slider(
+                label="Guidance Scale (Classifier free guidance)", value=7.5, minimum=1, maximum=20, step=0.1)
+            n_samples = gr.Slider(label="Num samples", value=1, minimum=1, maximum=8, step=1)
+            seed = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
+            steps = gr.Slider(label="Steps", value=50, minimum=10, maximum=100, step=1)
+            resize_short_edge = gr.Slider(label="Image resolution", value=512, minimum=320, maximum=1024, step=1)
+            cond_tau = gr.Slider(
+                label="timestamp parameter that determines until which step the adapter is applied",
+                value=1.0,
+                minimum=0.1,
+                maximum=1.0,
+                step=0.05)
+    with gr.Row():
+        submit = gr.Button("Generate")
+    output = gr.Gallery().style(grid=2, height='auto')
+    cond = gr.Gallery().style(grid=2, height='auto')
+    inps = list(chain(btns, ims1, ims2, cond_weights))
+    inps.extend([prompt, neg_prompt, scale, n_samples, seed, steps, resize_short_edge, cond_tau])
+    submit.click(fn=run, inputs=inps, outputs=[output, cond])
+demo.launch(server_name='0.0.0.0', share=False, server_port=47313)

{models → configs/mm}/faster_rcnn_r50_fpn_coco.py RENAMED Viewed

@@ -1,182 +1,182 @@
-checkpoint_config = dict(interval=1)
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[
-        dict(type='TextLoggerHook'),
-        # dict(type='TensorboardLoggerHook')
-    ])
-# yapf:enable
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
-# optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-optimizer_config = dict(grad_clip=None)
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.001,
-    step=[8, 11])
-total_epochs = 12
-model = dict(
-    type='FasterRCNN',
-    pretrained='torchvision://resnet50',
-    backbone=dict(
-        type='ResNet',
-        depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=True),
-        norm_eval=True,
-        style='pytorch'),
-    neck=dict(
-        type='FPN',
-        in_channels=[256, 512, 1024, 2048],
-        out_channels=256,
-        num_outs=5),
-    rpn_head=dict(
-        type='RPNHead',
-        in_channels=256,
-        feat_channels=256,
-        anchor_generator=dict(
-            type='AnchorGenerator',
-            scales=[8],
-            ratios=[0.5, 1.0, 2.0],
-            strides=[4, 8, 16, 32, 64]),
-        bbox_coder=dict(
-            type='DeltaXYWHBBoxCoder',
-            target_means=[.0, .0, .0, .0],
-            target_stds=[1.0, 1.0, 1.0, 1.0]),
-        loss_cls=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
-        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
-    roi_head=dict(
-        type='StandardRoIHead',
-        bbox_roi_extractor=dict(
-            type='SingleRoIExtractor',
-            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
-            out_channels=256,
-            featmap_strides=[4, 8, 16, 32]),
-        bbox_head=dict(
-            type='Shared2FCBBoxHead',
-            in_channels=256,
-            fc_out_channels=1024,
-            roi_feat_size=7,
-            num_classes=80,
-            bbox_coder=dict(
-                type='DeltaXYWHBBoxCoder',
-                target_means=[0., 0., 0., 0.],
-                target_stds=[0.1, 0.1, 0.2, 0.2]),
-            reg_class_agnostic=False,
-            loss_cls=dict(
-                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
-            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
-    # model training and testing settings
-    train_cfg=dict(
-        rpn=dict(
-            assigner=dict(
-                type='MaxIoUAssigner',
-                pos_iou_thr=0.7,
-                neg_iou_thr=0.3,
-                min_pos_iou=0.3,
-                match_low_quality=True,
-                ignore_iof_thr=-1),
-            sampler=dict(
-                type='RandomSampler',
-                num=256,
-                pos_fraction=0.5,
-                neg_pos_ub=-1,
-                add_gt_as_proposals=False),
-            allowed_border=-1,
-            pos_weight=-1,
-            debug=False),
-        rpn_proposal=dict(
-            nms_pre=2000,
-            max_per_img=1000,
-            nms=dict(type='nms', iou_threshold=0.7),
-            min_bbox_size=0),
-        rcnn=dict(
-            assigner=dict(
-                type='MaxIoUAssigner',
-                pos_iou_thr=0.5,
-                neg_iou_thr=0.5,
-                min_pos_iou=0.5,
-                match_low_quality=False,
-                ignore_iof_thr=-1),
-            sampler=dict(
-                type='RandomSampler',
-                num=512,
-                pos_fraction=0.25,
-                neg_pos_ub=-1,
-                add_gt_as_proposals=True),
-            pos_weight=-1,
-            debug=False)),
-    test_cfg=dict(
-        rpn=dict(
-            nms_pre=1000,
-            max_per_img=1000,
-            nms=dict(type='nms', iou_threshold=0.7),
-            min_bbox_size=0),
-        rcnn=dict(
-            score_thr=0.05,
-            nms=dict(type='nms', iou_threshold=0.5),
-            max_per_img=100)
-        # soft-nms is also supported for rcnn testing
-        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
-    ))
-dataset_type = 'CocoDataset'
-data_root = 'data/coco'
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='LoadAnnotations', with_bbox=True),
-    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
-]
-test_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(
-        type='MultiScaleFlipAug',
-        img_scale=(1333, 800),
-        flip=False,
-        transforms=[
-            dict(type='Resize', keep_ratio=True),
-            dict(type='RandomFlip'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(type='DefaultFormatBundle'),
-            dict(type='Collect', keys=['img']),
-        ])
-]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_train2017.json',
-        img_prefix=f'{data_root}/train2017/',
-        pipeline=train_pipeline),
-    val=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        pipeline=test_pipeline),
-    test=dict(
-        type=dataset_type,
-        ann_file=f'{data_root}/annotations/instances_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        pipeline=test_pipeline))
-evaluation = dict(interval=1, metric='bbox')

+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+total_epochs = 12
+model = dict(
+    type='FasterRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')

{models → configs/mm}/hrnet_w48_coco_256x192.py RENAMED Viewed

@@ -1,169 +1,169 @@
-# _base_ = [
-#     '../../../../_base_/default_runtime.py',
-#     '../../../../_base_/datasets/coco.py'
-# ]
-evaluation = dict(interval=10, metric='mAP', save_best='AP')
-optimizer = dict(
-    type='Adam',
-    lr=5e-4,
-)
-optimizer_config = dict(grad_clip=None)
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.001,
-    step=[170, 200])
-total_epochs = 210
-channel_cfg = dict(
-    num_output_channels=17,
-    dataset_joints=17,
-    dataset_channel=[
-        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
-    ],
-    inference_channel=[
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-    ])
-# model settings
-model = dict(
-    type='TopDown',
-    pretrained='https://download.openmmlab.com/mmpose/'
-    'pretrain_models/hrnet_w48-8ef0771d.pth',
-    backbone=dict(
-        type='HRNet',
-        in_channels=3,
-        extra=dict(
-            stage1=dict(
-                num_modules=1,
-                num_branches=1,
-                block='BOTTLENECK',
-                num_blocks=(4, ),
-                num_channels=(64, )),
-            stage2=dict(
-                num_modules=1,
-                num_branches=2,
-                block='BASIC',
-                num_blocks=(4, 4),
-                num_channels=(48, 96)),
-            stage3=dict(
-                num_modules=4,
-                num_branches=3,
-                block='BASIC',
-                num_blocks=(4, 4, 4),
-                num_channels=(48, 96, 192)),
-            stage4=dict(
-                num_modules=3,
-                num_branches=4,
-                block='BASIC',
-                num_blocks=(4, 4, 4, 4),
-                num_channels=(48, 96, 192, 384))),
-    ),
-    keypoint_head=dict(
-        type='TopdownHeatmapSimpleHead',
-        in_channels=48,
-        out_channels=channel_cfg['num_output_channels'],
-        num_deconv_layers=0,
-        extra=dict(final_conv_kernel=1, ),
-        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
-    train_cfg=dict(),
-    test_cfg=dict(
-        flip_test=True,
-        post_process='default',
-        shift_heatmap=True,
-        modulate_kernel=11))
-data_cfg = dict(
-    image_size=[192, 256],
-    heatmap_size=[48, 64],
-    num_output_channels=channel_cfg['num_output_channels'],
-    num_joints=channel_cfg['dataset_joints'],
-    dataset_channel=channel_cfg['dataset_channel'],
-    inference_channel=channel_cfg['inference_channel'],
-    soft_nms=False,
-    nms_thr=1.0,
-    oks_thr=0.9,
-    vis_thr=0.2,
-    use_gt_bbox=False,
-    det_bbox_thr=0.0,
-    bbox_file='data/coco/person_detection_results/'
-    'COCO_val2017_detections_AP_H_56_person.json',
-)
-train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='TopDownGetBboxCenterScale', padding=1.25),
-    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
-    dict(type='TopDownRandomFlip', flip_prob=0.5),
-    dict(
-        type='TopDownHalfBodyTransform',
-        num_joints_half_body=8,
-        prob_half_body=0.3),
-    dict(
-        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
-    dict(type='TopDownAffine'),
-    dict(type='ToTensor'),
-    dict(
-        type='NormalizeTensor',
-        mean=[0.485, 0.456, 0.406],
-        std=[0.229, 0.224, 0.225]),
-    dict(type='TopDownGenerateTarget', sigma=2),
-    dict(
-        type='Collect',
-        keys=['img', 'target', 'target_weight'],
-        meta_keys=[
-            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
-            'rotation', 'bbox_score', 'flip_pairs'
-        ]),
-]
-val_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='TopDownGetBboxCenterScale', padding=1.25),
-    dict(type='TopDownAffine'),
-    dict(type='ToTensor'),
-    dict(
-        type='NormalizeTensor',
-        mean=[0.485, 0.456, 0.406],
-        std=[0.229, 0.224, 0.225]),
-    dict(
-        type='Collect',
-        keys=['img'],
-        meta_keys=[
-            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
-            'flip_pairs'
-        ]),
-]
-test_pipeline = val_pipeline
-data_root = 'data/coco'
-data = dict(
-    samples_per_gpu=32,
-    workers_per_gpu=2,
-    val_dataloader=dict(samples_per_gpu=32),
-    test_dataloader=dict(samples_per_gpu=32),
-    train=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
-        img_prefix=f'{data_root}/train2017/',
-        data_cfg=data_cfg,
-        pipeline=train_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-    val=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        data_cfg=data_cfg,
-        pipeline=val_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-    test=dict(
-        type='TopDownCocoDataset',
-        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
-        img_prefix=f'{data_root}/val2017/',
-        data_cfg=data_cfg,
-        pipeline=test_pipeline,
-        dataset_info={{_base_.dataset_info}}),
-)

+# _base_ = [
+#     '../../../../_base_/default_runtime.py',
+#     '../../../../_base_/datasets/coco.py'
+# ]
+evaluation = dict(interval=10, metric='mAP', save_best='AP')
+optimizer = dict(
+    type='Adam',
+    lr=5e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+channel_cfg = dict(
+    num_output_channels=17,
+    dataset_joints=17,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    ])
+# model settings
+model = dict(
+    type='TopDown',
+    pretrained='https://download.openmmlab.com/mmpose/'
+    'pretrain_models/hrnet_w48-8ef0771d.pth',
+    backbone=dict(
+        type='HRNet',
+        in_channels=3,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(48, 96)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(48, 96, 192)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(48, 96, 192, 384))),
+    ),
+    keypoint_head=dict(
+        type='TopdownHeatmapSimpleHead',
+        in_channels=48,
+        out_channels=channel_cfg['num_output_channels'],
+        num_deconv_layers=0,
+        extra=dict(final_conv_kernel=1, ),
+        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=True,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(
+    image_size=[192, 256],
+    heatmap_size=[48, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'],
+    soft_nms=False,
+    nms_thr=1.0,
+    oks_thr=0.9,
+    vis_thr=0.2,
+    use_gt_bbox=False,
+    det_bbox_thr=0.0,
+    bbox_file='data/coco/person_detection_results/'
+    'COCO_val2017_detections_AP_H_56_person.json',
+)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
+    dict(type='TopDownRandomFlip', flip_prob=0.5),
+    dict(
+        type='TopDownHalfBodyTransform',
+        num_joints_half_body=8,
+        prob_half_body=0.3),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTarget', sigma=2),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs'
+        ]),
+]
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[
+            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs'
+        ]),
+]
+test_pipeline = val_pipeline
+data_root = 'data/coco'
+data = dict(
+    samples_per_gpu=32,
+    workers_per_gpu=2,
+    val_dataloader=dict(samples_per_gpu=32),
+    test_dataloader=dict(samples_per_gpu=32),
+    train=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        data_cfg=data_cfg,
+        pipeline=train_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    val=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=val_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    test=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=test_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+)

configs/stable-diffusion/sd-v1-inference.yaml ADDED Viewed

	@@ -0,0 +1,65 @@

+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.WebUIFrozenCLIPEmebedder
+      params:
+        version: openai/clip-vit-large-patch14
+        layer: last

configs/stable-diffusion/sd-v1-train.yaml ADDED Viewed

	@@ -0,0 +1,86 @@

+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: 1e4

configs/stable-diffusion/train_keypose.yaml ADDED Viewed

	@@ -0,0 +1,87 @@

+name: train_keypose
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: 1e4

configs/stable-diffusion/train_mask.yaml ADDED Viewed

	@@ -0,0 +1,87 @@

+name: train_mask
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: 1e4

configs/stable-diffusion/train_sketch.yaml ADDED Viewed

	@@ -0,0 +1,87 @@

+name: train_sketch
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config: #__is_unconditional__
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        version: openai/clip-vit-large-patch14
+logger:
+  print_freq: 100
+  save_checkpoint_freq: !!float 1e4
+  use_tb_logger: true
+  wandb:
+    project: ~
+    resume_id: ~
+dist_params:
+  backend: nccl
+  port: 29500
+training:
+  lr: !!float 1e-5
+  save_freq: 1e4

demo/demos.py DELETED Viewed

@@ -1,309 +0,0 @@
-import gradio as gr
-import numpy as np
-import psutil
-def create_map():
-    return np.zeros(shape=(512, 512), dtype=np.uint8)+255
-def get_system_memory():
-    memory = psutil.virtual_memory()
-    memory_percent = memory.percent
-    memory_used = memory.used / (1024.0 ** 3)
-    memory_total = memory.total / (1024.0 ** 3)
-    return {"percent": f"{memory_percent}%", "used": f"{memory_used:.3f}GB", "total": f"{memory_total:.3f}GB"}
-def create_demo_keypose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Keypose)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Keypose', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a keypose map)')
-                    fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed to produce a fixed output)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the keypose to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-        ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_openpose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Openpose)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Openpose', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a openpose map)')
-                    fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed to produce a fixed output)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the openpose to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-        ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Sketch)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_canny(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Canny)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Canny', 'Image'], type="value", default='Image', label='Input Types\n (You can input an image or a canny map)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the canny background\n (Only work for canny input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the canny to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_color_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Color + Sketch)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_sketch = gr.Image(source='upload', type="numpy", label='Sketch guidance')
-                    input_img_color = gr.Image(source='upload', type="numpy", label='Color guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                type_in_color = gr.inputs.Radio(['ColorMap', 'Image'], type="value", default='Image', label='Input Types of Color\n (You can input an image or a color map)')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types of Sketch\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                with gr.Row():
-                    w_sketch = gr.Slider(label="Sketch guidance weight", minimum=0, maximum=2, value=1.0, step=0.1)
-                    w_color = gr.Slider(label="Color guidance weight", minimum=0, maximum=2, value=1.2, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=3, height='auto')
-            ips = [input_img_sketch, input_img_color, type_in, type_in_color, w_sketch, w_color, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_style_sketch(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Style + Sketch)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_sketch = gr.Image(source='upload', type="numpy", label='Sketch guidance')
-                    input_img_style = gr.Image(source='upload', type="numpy", label='Style guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Sketch', 'Image'], type="value", default='Image', label='Input Types of Sketch\n (You can input an image or a sketch)')
-                    color_back = gr.inputs.Radio(['White', 'Black'], type="value", default='Black', label='Color of the sketch background\n (Only work for sketch input)')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img_sketch, input_img_style, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_color(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Color)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy", label='Color guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                type_in_color = gr.inputs.Radio(['ColorMap', 'Image'], type="value", default='Image', label='Input Types of Color\n (You can input an image or a color map)')
-                w_color = gr.Slider(label="Color guidance weight", minimum=0, maximum=2, value=1, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, prompt, neg_prompt, pos_prompt, w_color, type_in_color, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_seg(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Segmentation)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Segmentation', 'Image'], type="value", default='Image', label='You can input an image or a segmentation. If you choose to input a segmentation, it must correspond to the coco-stuff')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the segmentation to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_depth(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Depth)')
-        with gr.Row():
-            with gr.Column():
-                input_img = gr.Image(source='upload', type="numpy")
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in = gr.inputs.Radio(['Depth', 'Image'], type="value", default='Image', label='You can input an image or a depth map')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the depth map to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_depth_keypose(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Depth & Keypose)')
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    input_img_depth = gr.Image(source='upload', type="numpy", label='Depth guidance')
-                    input_img_keypose = gr.Image(source='upload', type="numpy", label='Keypose guidance')
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                with gr.Row():
-                    type_in_depth = gr.inputs.Radio(['Depth', 'Image'], type="value", default='Image', label='You can input an image or a depth map')
-                    type_in_keypose = gr.inputs.Radio(['Keypose', 'Image'], type="value", default='Image', label='You can input an image or a keypose map (mmpose style)')
-                with gr.Row():
-                    w_depth = gr.Slider(label="Depth guidance weight", minimum=0, maximum=2, value=1.0, step=0.1)
-                    w_keypose = gr.Slider(label="Keypose guidance weight", minimum=0, maximum=2, value=1.5, step=0.1)
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the multi-guidance to the result)", minimum=0, maximum=1, value=1, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=3, height='auto')
-            ips = [input_img_depth, input_img_keypose, type_in_depth, type_in_keypose, w_depth, w_keypose, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo
-def create_demo_draw(process):
-    with gr.Blocks() as demo:
-        with gr.Row():
-            gr.Markdown('## T2I-Adapter (Hand-free drawing)')
-        with gr.Row():
-            with gr.Column():
-                create_button = gr.Button(label="Start", value='Hand-free drawing')
-                input_img = gr.Image(source='upload', type="numpy",tool='sketch')
-                create_button.click(fn=create_map, outputs=[input_img], queue=False)
-                prompt = gr.Textbox(label="Prompt")
-                neg_prompt = gr.Textbox(label="Negative Prompt",
-                value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
-                pos_prompt = gr.Textbox(label="Positive Prompt",
-                value = 'crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
-                run_button = gr.Button(label="Run")
-                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the sketch to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
-                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
-                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
-                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
-            with gr.Column():
-                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-            ips = [input_img, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
-        run_button.click(fn=process, inputs=ips, outputs=[result])
-    return demo

demo/model.py DELETED Viewed

@@ -1,979 +0,0 @@
-import torch
-from basicsr.utils import img2tensor, tensor2img
-from pytorch_lightning import seed_everything
-from ldm.models.diffusion.plms import PLMSSampler
-from ldm.modules.encoders.adapter import Adapter, Adapter_light, StyleAdapter
-from ldm.util import instantiate_from_config
-from ldm.modules.structure_condition.model_edge import pidinet
-from ldm.modules.structure_condition.model_seg import seger, Colorize
-from ldm.modules.structure_condition.midas.api import MiDaSInference
-import gradio as gr
-from omegaconf import OmegaConf
-import mmcv
-from mmdet.apis import inference_detector, init_detector
-from mmpose.apis import (inference_top_down_pose_model, init_pose_model, process_mmdet_results, vis_pose_result)
-import os
-import cv2
-import numpy as np
-import torch.nn.functional as F
-from transformers import CLIPProcessor, CLIPVisionModel
-from PIL import Image
-def preprocessing(image, device):
-    # Resize
-    scale = 640 / max(image.shape[:2])
-    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
-    raw_image = image.astype(np.uint8)
-    # Subtract mean values
-    image = image.astype(np.float32)
-    image -= np.array(
-        [
-            float(104.008),
-            float(116.669),
-            float(122.675),
-        ]
-    )
-    # Convert to torch.Tensor and add "batch" axis
-    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
-    image = image.to(device)
-    return image, raw_image
-def imshow_keypoints(img,
-                     pose_result,
-                     skeleton=None,
-                     kpt_score_thr=0.1,
-                     pose_kpt_color=None,
-                     pose_link_color=None,
-                     radius=4,
-                     thickness=1):
-    """Draw keypoints and links on an image.
-    Args:
-            img (ndarry): The image to draw poses on.
-            pose_result (list[kpts]): The poses to draw. Each element kpts is
-                a set of K keypoints as an Kx3 numpy.ndarray, where each
-                keypoint is represented as x, y, score.
-            kpt_score_thr (float, optional): Minimum score of keypoints
-                to be shown. Default: 0.3.
-            pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None,
-                the keypoint will not be drawn.
-            pose_link_color (np.array[Mx3]): Color of M links. If None, the
-                links will not be drawn.
-            thickness (int): Thickness of lines.
-    """
-    img_h, img_w, _ = img.shape
-    img = np.zeros(img.shape)
-    for idx, kpts in enumerate(pose_result):
-        if idx > 1:
-            continue
-        kpts = kpts['keypoints']
-        kpts = np.array(kpts, copy=False)
-        # draw each point on image
-        if pose_kpt_color is not None:
-            assert len(pose_kpt_color) == len(kpts)
-            for kid, kpt in enumerate(kpts):
-                x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2]
-                if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None:
-                    # skip the point that should not be drawn
-                    continue
-                color = tuple(int(c) for c in pose_kpt_color[kid])
-                cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1)
-        # draw links
-        if skeleton is not None and pose_link_color is not None:
-            assert len(pose_link_color) == len(skeleton)
-            for sk_id, sk in enumerate(skeleton):
-                pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1]))
-                pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1]))
-                if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0
-                        or pos2[0] >= img_w or pos2[1] <= 0 or pos2[1] >= img_h or kpts[sk[0], 2] < kpt_score_thr
-                        or kpts[sk[1], 2] < kpt_score_thr or pose_link_color[sk_id] is None):
-                    # skip the link that should not be drawn
-                    continue
-                color = tuple(int(c) for c in pose_link_color[sk_id])
-                cv2.line(img, pos1, pos2, color, thickness=thickness)
-    return img
-def load_model_from_config(config, ckpt, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    if "state_dict" in pl_sd:
-        sd = pl_sd["state_dict"]
-    else:
-        sd = pl_sd
-    model = instantiate_from_config(config.model)
-    _, _ = model.load_state_dict(sd, strict=False)
-    model.cuda()
-    model.eval()
-    return model
-class Model_all:
-    def __init__(self, device='cpu'):
-        """Load the base diffusion model, all adapters and the helper networks.
-
-        Args:
-            device: Device for the diffusion model, the adapters, the edge,
-                segmentation and depth networks and the CLIP vision model.
-                The mmdet/mmpose models are always initialised on CPU (see
-                the ``device = 'cpu'`` rebinding further down).
-        """
-        # common part: base diffusion model and its sampler
-        self.device = device
-        self.config = OmegaConf.load("configs/stable-diffusion/app.yaml")
-        self.config.model.params.cond_stage_config.params.device = device
-        self.base_model = load_model_from_config(self.config, "models/sd-v1-4.ckpt").to(device)
-        # Name of the checkpoint currently held by base_model; the process_*
-        # methods compare against it before reloading weights.
-        self.current_base = 'sd-v1-4.ckpt'
-        self.sampler = PLMSSampler(self.base_model)
-        # sketch part: canny and sketch adapters plus the edge extractor
-        # (the [:4] slice on the 4-element channel list is a no-op)
-        self.model_canny = Adapter(channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                    use_conv=False).to(device)
-        self.model_canny.load_state_dict(torch.load("models/t2iadapter_canny_sd14v1.pth", map_location=device))
-        self.model_sketch = Adapter(channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                    use_conv=False).to(device)
-        self.model_sketch.load_state_dict(torch.load("models/t2iadapter_sketch_sd14v1.pth", map_location=device))
-        self.model_edge = pidinet().to(device)
-        # Strip the 'module.' prefix from the checkpoint keys
-        # (presumably left by a DataParallel wrapper -- TODO confirm).
-        self.model_edge.load_state_dict({k.replace('module.', ''): v for k, v in
-                                         torch.load('models/table5_pidinet.pth', map_location=device)[
-                                             'state_dict'].items()})
-        # segmentation part
-        self.model_seger = seger().to(device)
-        self.model_seger.eval()
-        self.coler = Colorize(n=182)
-        self.model_seg = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                 use_conv=False).to(device)
-        self.model_seg.load_state_dict(torch.load("models/t2iadapter_seg_sd14v1.pth", map_location=device))
-        # depth part: depth estimator and depth adapter
-        self.depth_model = MiDaSInference(model_type='dpt_hybrid').to(device)
-        self.model_depth = Adapter(cin=3 * 64, channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                   use_conv=False).to(device)
-        self.model_depth.load_state_dict(torch.load("models/t2iadapter_depth_sd14v1.pth", map_location=device))
-        # keypose part
-        self.model_pose = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                  use_conv=False).to(device)
-        self.model_pose.load_state_dict(torch.load("models/t2iadapter_keypose_sd14v1.pth", map_location=device))
-        # openpose part
-        self.model_openpose = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
-                                  use_conv=False).to(device)
-        self.model_openpose.load_state_dict(torch.load("models/t2iadapter_openpose_sd14v1.pth", map_location=device))
-        # color part
-        self.model_color = Adapter_light(cin=int(3 * 64), channels=[320, 640, 1280, 1280], nums_rb=4).to(device)
-        self.model_color.load_state_dict(torch.load("models/t2iadapter_color_sd14v1.pth", map_location=device))
-        # style part: style adapter fed by CLIP vision features
-        self.model_style = StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8).to(device)
-        self.model_style.load_state_dict(torch.load("models/t2iadapter_style_sd14v1.pth", map_location=device))
-        self.clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
-        self.clip_vision_model = CLIPVisionModel.from_pretrained('openai/clip-vit-large-patch14').to(device)
-        # Rebinds the local name only (self.device is unchanged): the detector
-        # and pose model created below are therefore initialised on CPU.
-        device = 'cpu'
-        ## mmpose: config and checkpoint paths for person detection and pose estimation
-        det_config = 'models/faster_rcnn_r50_fpn_coco.py'
-        det_checkpoint = 'models/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
-        pose_config = 'models/hrnet_w48_coco_256x192.py'
-        pose_checkpoint = 'models/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
-        # Category id handed to process_mmdet_results and the bbox threshold
-        # handed to inference_top_down_pose_model in process_depth_keypose.
-        self.det_cat_id = 1
-        self.bbox_thr = 0.2
-        ## detector
-        det_config_mmcv = mmcv.Config.fromfile(det_config)
-        self.det_model = init_detector(det_config_mmcv, det_checkpoint, device=device)
-        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
-        self.pose_model = init_pose_model(pose_config_mmcv, pose_checkpoint, device=device)
-        ## drawing tables used by imshow_keypoints: 19 links between keypoint
-        ## indices, 17 keypoint colors and 19 link colors
-        ## (presumably the COCO 17-keypoint layout -- TODO confirm)
-        self.skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8],
-                         [7, 9], [8, 10],
-                         [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]
-        self.pose_kpt_color = [[51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255],
-                               [0, 255, 0],
-                               [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0],
-                               [255, 128, 0],
-                               [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0]]
-        self.pose_link_color = [[0, 255, 0], [0, 255, 0], [255, 128, 0], [255, 128, 0],
-                                [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0],
-                                [255, 128, 0],
-                                [0, 255, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255],
-                                [51, 153, 255],
-                                [51, 153, 255], [51, 153, 255], [51, 153, 255]]
-    def load_vae(self):
-        vae_sd = torch.load(os.path.join('models', 'anything-v4.0.vae.pt'), map_location="cuda")
-        sd = vae_sd["state_dict"]
-        self.base_model.first_stage_model.load_state_dict(sd, strict=False)
-    @torch.no_grad()
-    def process_sketch(self, input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                       con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_edge, x_samples_ddim]
-    @torch.no_grad()
-    def process_canny(self, input_img, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                       con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        if type_in == 'Canny':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-        elif type_in == 'Image':
-            im = cv2.Canny(im,100,200)
-            im = img2tensor(im[..., None], bgr2rgb=True, float32=True).unsqueeze(0) / 255.
-            im_edge = tensor2img(im)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_canny(im.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_edge, x_samples_ddim]
-    @torch.no_grad()
-    def process_color_sketch(self, input_img_sketch, input_img_color, type_in, type_in_color, w_sketch, w_color, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img_sketch, (512, 512))
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]#.cuda()
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-        if type_in_color == 'Image':
-            input_img_color = cv2.resize(input_img_color,(512//64, 512//64), interpolation=cv2.INTER_CUBIC)
-            input_img_color = cv2.resize(input_img_color,(512,512), interpolation=cv2.INTER_NEAREST)
-        else:
-            input_img_color = cv2.resize(input_img_color, (512, 512))
-        im_color = input_img_color.copy()
-        im_color_tensor = img2tensor(input_img_color, bgr2rgb=False).unsqueeze(0) / 255.
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter_sketch = self.model_sketch(im.to(self.device))
-        features_adapter_color = self.model_color(im_color_tensor.to(self.device))
-        features_adapter = [fs*w_sketch+fc*w_color for fs, fc in zip(features_adapter_sketch,features_adapter_color)]
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_edge, im_color, x_samples_ddim]
-    @torch.no_grad()
-    def process_style_sketch(self, input_img_sketch, input_img_style, type_in, color_back, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img_sketch, (512, 512))
-        if type_in == 'Sketch':
-            if color_back == 'White':
-                im = 255 - im
-            im_edge = im.copy()
-            im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-            im = im > 0.5
-            im = im.float()
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 255.
-            im = self.model_edge(im.to(self.device))[-1]#.cuda()
-            im = im > 0.5
-            im = im.float()
-            im_edge = tensor2img(im)
-        style = Image.fromarray(input_img_style)
-        style_for_clip = self.clip_processor(images=style, return_tensors="pt")['pixel_values']
-        style_feat = self.clip_vision_model(style_for_clip.to(self.device))['last_hidden_state']
-        style_feat = self.model_style(style_feat)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='style',
-                                              con_strength=con_strength,
-                                              style_feature=style_feat)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_edge, x_samples_ddim]
-    @torch.no_grad()
-    def process_color(self, input_img, prompt, neg_prompt, pos_prompt, w_color, type_in_color, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        if type_in_color == 'Image':
-            input_img = cv2.resize(input_img,(512//64, 512//64), interpolation=cv2.INTER_CUBIC)
-            input_img = cv2.resize(input_img,(512,512), interpolation=cv2.INTER_NEAREST)
-        else:
-            input_img = cv2.resize(input_img, (512, 512))
-        im_color = input_img.copy()
-        im = img2tensor(input_img, bgr2rgb=False).unsqueeze(0) / 255.
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_color(im.to(self.device))
-        features_adapter = [fi*w_color for fi in features_adapter]
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_color, x_samples_ddim]
-    @torch.no_grad()
-    def process_depth(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                      con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        if type_in == 'Depth':
-            im_depth = im.copy()
-            depth = img2tensor(im).unsqueeze(0) / 255.
-        elif type_in == 'Image':
-            im = img2tensor(im).unsqueeze(0) / 127.5 - 1.0
-            depth = self.depth_model(im.to(self.device)).repeat(1, 3, 1, 1)
-            depth -= torch.min(depth)
-            depth /= torch.max(depth)
-            im_depth = tensor2img(depth)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_depth(depth.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_depth, x_samples_ddim]
-    @torch.no_grad()
-    def process_depth_keypose(self, input_img_depth, input_img_keypose, type_in_depth, type_in_keypose, w_depth,
-                              w_keypose, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        if fix_sample == 'True':
-            seed_everything(42)
-        im_depth = cv2.resize(input_img_depth, (512, 512))
-        im_keypose = cv2.resize(input_img_keypose, (512, 512))
-        # get depth
-        if type_in_depth == 'Depth':
-            im_depth_out = im_depth.copy()
-            depth = img2tensor(im_depth).unsqueeze(0) / 255.
-        elif type_in_depth == 'Image':
-            im_depth = img2tensor(im_depth).unsqueeze(0) / 127.5 - 1.0
-            depth = self.depth_model(im_depth.to(self.device)).repeat(1, 3, 1, 1)
-            depth -= torch.min(depth)
-            depth /= torch.max(depth)
-            im_depth_out = tensor2img(depth)
-        # get keypose
-        if type_in_keypose == 'Keypose':
-            im_keypose_out = im_keypose.copy()[:,:,::-1]
-        elif type_in_keypose == 'Image':
-            image = im_keypose.copy()
-            im_keypose = img2tensor(im_keypose).unsqueeze(0) / 255.
-            mmdet_results = inference_detector(self.det_model, image)
-            # keep the person class bounding boxes.
-            person_results = process_mmdet_results(mmdet_results, self.det_cat_id)
-            # optional
-            return_heatmap = False
-            dataset = self.pose_model.cfg.data['test']['type']
-            # e.g. use ('backbone', ) to return backbone feature
-            output_layer_names = None
-            pose_results, _ = inference_top_down_pose_model(
-                self.pose_model,
-                image,
-                person_results,
-                bbox_thr=self.bbox_thr,
-                format='xyxy',
-                dataset=dataset,
-                dataset_info=None,
-                return_heatmap=return_heatmap,
-                outputs=output_layer_names)
-            # show the results
-            im_keypose_out = imshow_keypoints(
-                image,
-                pose_results,
-                skeleton=self.skeleton,
-                pose_kpt_color=self.pose_kpt_color,
-                pose_link_color=self.pose_link_color,
-                radius=2,
-                thickness=2)
-            im_keypose_out = im_keypose_out.astype(np.uint8)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter_depth = self.model_depth(depth.to(self.device))
-        pose = img2tensor(im_keypose_out, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter_keypose = self.model_pose(pose.to(self.device))
-        features_adapter = [f_d * w_depth + f_k * w_keypose for f_d, f_k in
-                            zip(features_adapter_depth, features_adapter_keypose)]
-        shape = [4, 64, 64]
-        # sampling
-        con_strength = int((1 - con_strength) * 50)
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_depth_out, im_keypose_out[:, :, ::-1], x_samples_ddim]
-    @torch.no_grad()
-    def process_seg(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale,
-                    con_strength, base_model):
-        """Sample an image conditioned on a segmentation map.
-        Args:
-            input_img: image array; it is resized to 512x512 with ``cv2.resize``.
-            type_in: 'Segmentation' uses ``input_img`` itself as the label map;
-                'Image' predicts a label map with ``self.model_seger``.
-            prompt, pos_prompt: joined as ``prompt + ', ' + pos_prompt`` for the conditioning text.
-            neg_prompt: text used for the unconditional conditioning.
-            fix_sample: the string 'True' triggers ``seed_everything(42)``.
-            scale: forwarded as ``unconditional_guidance_scale``.
-            con_strength: mapped to ``int((1 - con_strength) * 50)`` before being forwarded
-                to the sampler (presumably a fraction of the 50 steps -- TODO confirm).
-            base_model: checkpoint file name under ``models/``.
-        Returns:
-            ``[im_seg, x_samples_ddim]``: the segmentation image and the first decoded
-            sample converted to a uint8 array.
-        """
-        # Reload the base weights only when a different checkpoint is requested.
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            # Checkpoints may wrap the weights in a "state_dict" entry.
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        # fix_sample is compared as a string, not as a bool.
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        # NOTE(review): any other type_in value leaves labelmap / im_seg unbound,
-        # so the code below would fail with a NameError.
-        if type_in == 'Segmentation':
-            im_seg = im.copy()
-            im = img2tensor(im).unsqueeze(0) / 255.
-            labelmap = im.float()
-        elif type_in == 'Image':
-            im, _ = preprocessing(im, self.device)
-            _, _, H, W = im.shape
-            # Image -> Probability map
-            logits = self.model_seger(im)
-            logits = F.interpolate(logits, size=(H, W), mode="bilinear", align_corners=False)
-            probs = F.softmax(logits, dim=1)[0]
-            probs = probs.cpu().data.numpy()
-            # Per-pixel class index, then colourised by self.coler.
-            labelmap = np.argmax(probs, axis=0)
-            labelmap = self.coler(labelmap)
-            labelmap = np.transpose(labelmap, (1, 2, 0))
-            labelmap = cv2.resize(labelmap, (512, 512))
-            labelmap = img2tensor(labelmap, bgr2rgb=False, float32=True) / 255.
-            im_seg = tensor2img(labelmap)[:, :, ::-1]
-            labelmap = labelmap.unsqueeze(0)
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_seg(labelmap.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        # NOTE(review): mode='sketch' is passed here even though the adapter is the
-        # segmentation one -- confirm this is what the sampler expects.
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        # Map the decoded values from [-1, 1] to [0, 1], then to uint8 in [0, 255].
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_seg, x_samples_ddim]
-    @torch.no_grad()
-    def process_draw(self, input_img, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
-        """Sample an image conditioned on a user drawing.
-        Args:
-            input_img: mapping whose 'mask' entry is an array with at least four
-                channels; channels 0-2 are used as colour and channel 3 as alpha.
-            prompt, pos_prompt: joined as ``prompt + ', ' + pos_prompt`` for the conditioning text.
-            neg_prompt: text used for the unconditional conditioning.
-            fix_sample: the string 'True' triggers ``seed_everything(42)``.
-            scale: forwarded as ``unconditional_guidance_scale``.
-            con_strength: mapped to ``int((1 - con_strength) * 50)`` before being forwarded
-                to the sampler (presumably a fraction of the 50 steps -- TODO confirm).
-            base_model: checkpoint file name under ``models/``.
-        Returns:
-            ``[im_edge, x_samples_ddim]``: the composited 512x512 drawing and the first
-            decoded sample converted to a uint8 array.
-        """
-        # Reload the base weights only when a different checkpoint is requested.
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            # Checkpoints may wrap the weights in a "state_dict" entry.
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        # fix_sample is compared as a string, not as a bool.
-        if fix_sample == 'True':
-            seed_everything(42)
-        input_img = input_img['mask']
-        # Alpha-composite the drawing onto a white (255) background.
-        # Note: the name c is reused further down for the text conditioning.
-        c = input_img[:, :, 0:3].astype(np.float32)
-        a = input_img[:, :, 3:4].astype(np.float32) / 255.0
-        im = c * a + 255.0 * (1.0 - a)
-        im = im.clip(0, 255).astype(np.uint8)
-        im = cv2.resize(im, (512, 512))
-        im_edge = im.copy()
-        # Keep only the first channel and binarise it at 0.5.
-        im = img2tensor(im)[0].unsqueeze(0).unsqueeze(0) / 255.
-        im = im > 0.5
-        im = im.float()
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        features_adapter = self.model_sketch(im.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        # Map the decoded values from [-1, 1] to [0, 1], then to uint8 in [0, 255].
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_edge, x_samples_ddim]
-    @torch.no_grad()
-    def process_keypose(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength,
-                        base_model):
-        """Sample an image conditioned on a keypose map.
-        Args:
-            input_img: image array; it is resized to 512x512 with ``cv2.resize``.
-            type_in: 'Keypose' uses ``input_img`` (channel order reversed) as the pose map;
-                'Image' detects people and draws their keypoints with the detection
-                and pose models held on ``self``.
-            prompt, pos_prompt: joined as ``prompt + ', ' + pos_prompt`` for the conditioning text.
-            neg_prompt: text used for the unconditional conditioning.
-            fix_sample: the string 'True' triggers ``seed_everything(42)``.
-            scale: forwarded as ``unconditional_guidance_scale``.
-            con_strength: mapped to ``int((1 - con_strength) * 50)`` before being forwarded
-                to the sampler (presumably a fraction of the 50 steps -- TODO confirm).
-            base_model: checkpoint file name under ``models/``.
-        Returns:
-            ``[pose_image, x_samples_ddim]``: the pose map with its channel order reversed
-            again (uint8) and the first decoded sample converted to a uint8 array.
-        """
-        # Reload the base weights only when a different checkpoint is requested.
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            # Checkpoints may wrap the weights in a "state_dict" entry.
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        # fix_sample is compared as a string, not as a bool.
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        # NOTE(review): any other type_in value leaves im_pose unbound.
-        if type_in == 'Keypose':
-            im_pose = im.copy()[:,:,::-1]
-        elif type_in == 'Image':
-            image = im.copy()
-            # NOTE(review): this tensor is not read again in this method.
-            im = img2tensor(im).unsqueeze(0) / 255.
-            mmdet_results = inference_detector(self.det_model, image)
-            # keep the person class bounding boxes.
-            person_results = process_mmdet_results(mmdet_results, self.det_cat_id)
-            # optional
-            return_heatmap = False
-            dataset = self.pose_model.cfg.data['test']['type']
-            # e.g. use ('backbone', ) to return backbone feature
-            output_layer_names = None
-            pose_results, _ = inference_top_down_pose_model(
-                self.pose_model,
-                image,
-                person_results,
-                bbox_thr=self.bbox_thr,
-                format='xyxy',
-                dataset=dataset,
-                dataset_info=None,
-                return_heatmap=return_heatmap,
-                outputs=output_layer_names)
-            # show the results
-            im_pose = imshow_keypoints(
-                image,
-                pose_results,
-                skeleton=self.skeleton,
-                pose_kpt_color=self.pose_kpt_color,
-                pose_link_color=self.pose_link_color,
-                radius=2,
-                thickness=2)
-        # im_pose = cv2.resize(im_pose, (512, 512))
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        pose = img2tensor(im_pose, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter = self.model_pose(pose.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        # NOTE(review): mode='sketch' is passed here even though the adapter is the
-        # keypose one -- confirm this is what the sampler expects.
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        # Map the decoded values from [-1, 1] to [0, 1], then to uint8 in [0, 255].
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_pose[:, :, ::-1].astype(np.uint8), x_samples_ddim]
-    @torch.no_grad()
-    def process_openpose(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength,
-                        base_model):
-        """Sample an image conditioned on an OpenPose map.
-        Args:
-            input_img: image array; it is resized to 512x512 with ``cv2.resize``.
-            type_in: 'Openpose' uses ``input_img`` (channel order reversed) as the pose map;
-                'Image' computes one with ``OpenposeInference``.
-            prompt, pos_prompt: joined as ``prompt + ', ' + pos_prompt`` for the conditioning text.
-            neg_prompt: text used for the unconditional conditioning.
-            fix_sample: the string 'True' triggers ``seed_everything(42)``.
-            scale: forwarded as ``unconditional_guidance_scale``.
-            con_strength: mapped to ``int((1 - con_strength) * 50)`` before being forwarded
-                to the sampler (presumably a fraction of the 50 steps -- TODO confirm).
-            base_model: checkpoint file name under ``models/``.
-        Returns:
-            ``[pose_image, x_samples_ddim]``: the pose map with its channel order reversed
-            again (uint8) and the first decoded sample converted to a uint8 array.
-        """
-        # Reload the base weights only when a different checkpoint is requested.
-        if self.current_base != base_model:
-            ckpt = os.path.join("models", base_model)
-            pl_sd = torch.load(ckpt, map_location="cuda")
-            # Checkpoints may wrap the weights in a "state_dict" entry.
-            if "state_dict" in pl_sd:
-                sd = pl_sd["state_dict"]
-            else:
-                sd = pl_sd
-            self.base_model.load_state_dict(sd, strict=False)
-            self.current_base = base_model
-            if 'anything' in base_model.lower():
-                self.load_vae()
-        con_strength = int((1 - con_strength) * 50)
-        # fix_sample is compared as a string, not as a bool.
-        if fix_sample == 'True':
-            seed_everything(42)
-        im = cv2.resize(input_img, (512, 512))
-        # NOTE(review): any other type_in value leaves im_pose unbound.
-        if type_in == 'Openpose':
-            im_pose = im.copy()[:,:,::-1]
-        elif type_in == 'Image':
-            # Imported lazily; a new OpenposeInference is built on every call that
-            # takes this branch.
-            from ldm.modules.structure_condition.openpose.api import OpenposeInference
-            model = OpenposeInference()
-            keypose = model(im[:,:,::-1])
-            im_pose = keypose.copy()
-        # extract condition features
-        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
-        nc = self.base_model.get_learned_conditioning([neg_prompt])
-        pose = img2tensor(im_pose, bgr2rgb=True, float32=True) / 255.
-        pose = pose.unsqueeze(0)
-        features_adapter = self.model_openpose(pose.to(self.device))
-        shape = [4, 64, 64]
-        # sampling
-        # NOTE(review): mode='sketch' is passed here even though the adapter is the
-        # openpose one -- confirm this is what the sampler expects.
-        samples_ddim, _ = self.sampler.sample(S=50,
-                                              conditioning=c,
-                                              batch_size=1,
-                                              shape=shape,
-                                              verbose=False,
-                                              unconditional_guidance_scale=scale,
-                                              unconditional_conditioning=nc,
-                                              eta=0.0,
-                                              x_T=None,
-                                              features_adapter1=features_adapter,
-                                              mode='sketch',
-                                              con_strength=con_strength)
-        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
-        # Map the decoded values from [-1, 1] to [0, 1], then to uint8 in [0, 255].
-        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-        x_samples_ddim = x_samples_ddim.to('cpu')
-        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
-        x_samples_ddim = 255. * x_samples_ddim
-        x_samples_ddim = x_samples_ddim.astype(np.uint8)
-        return [im_pose[:, :, ::-1].astype(np.uint8), x_samples_ddim]
-# Script entry point: builds a Model_all instance with 'cpu' as its argument.
-if __name__ == '__main__':
-    model = Model_all('cpu')

dist_util.py ADDED Viewed

	@@ -0,0 +1,91 @@

+# Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py  # noqa: E501
+import functools
+import os
+import subprocess
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+def init_dist(launcher, backend='nccl', **kwargs):
+    """Initialise ``torch.distributed`` for the given launcher.
+    Args:
+        launcher (str): Either ``'pytorch'`` or ``'slurm'``.
+        backend (str): Backend of torch.distributed. Defaults to ``'nccl'``.
+        **kwargs: Forwarded to the launcher-specific initialiser.
+    Raises:
+        ValueError: If ``launcher`` is not one of the supported values.
+    """
+    # Choose 'spawn' only when no start method has been set yet.
+    current_method = mp.get_start_method(allow_none=True)
+    if current_method is None:
+        mp.set_start_method('spawn')
+    if launcher not in ('pytorch', 'slurm'):
+        raise ValueError(f'Invalid launcher type: {launcher}')
+    if launcher == 'slurm':
+        _init_dist_slurm(backend, **kwargs)
+    else:
+        _init_dist_pytorch(backend, **kwargs)
+def _init_dist_pytorch(backend, **kwargs):
+    """Initialise the process group for a PyTorch-launched job.
+    The CUDA device is chosen as ``RANK`` modulo the number of visible GPUs.
+    Args:
+        backend (str): Backend of torch.distributed.
+        **kwargs: Forwarded to ``dist.init_process_group``.
+    """
+    global_rank = int(os.environ['RANK'])
+    device_index = global_rank % torch.cuda.device_count()
+    torch.cuda.set_device(device_index)
+    dist.init_process_group(backend=backend, **kwargs)
+def _init_dist_slurm(backend, port=None):
+    """Set up torch.distributed from the environment of a Slurm job.
+    The master port is resolved in this order: the ``port`` argument, an
+    existing ``MASTER_PORT`` environment variable, then the default ``29500``.
+    Args:
+        backend (str): Backend of torch.distributed.
+        port (int, optional): Master port. Defaults to None.
+    """
+    env = os.environ
+    proc_id = int(env['SLURM_PROCID'])
+    ntasks = int(env['SLURM_NTASKS'])
+    node_list = env['SLURM_NODELIST']
+    num_gpus = torch.cuda.device_count()
+    local_rank = proc_id % num_gpus
+    torch.cuda.set_device(local_rank)
+    # The first host name in the node list serves as the master address.
+    addr = subprocess.getoutput(f'scontrol show hostname {node_list} | head -n1')
+    if port is not None:
+        env['MASTER_PORT'] = str(port)
+    else:
+        # Keep an existing MASTER_PORT; otherwise use the torch.distributed default.
+        env.setdefault('MASTER_PORT', '29500')
+    env['MASTER_ADDR'] = addr
+    env['WORLD_SIZE'] = str(ntasks)
+    env['LOCAL_RANK'] = str(local_rank)
+    env['RANK'] = str(proc_id)
+    dist.init_process_group(backend=backend)
+def get_dist_info():
+    """Return ``(rank, world_size)`` of the current process.
+    Falls back to ``(0, 1)`` when torch.distributed is unavailable or has not
+    been initialised.
+    """
+    if not (dist.is_available() and dist.is_initialized()):
+        return 0, 1
+    return dist.get_rank(), dist.get_world_size()
+def master_only(func):
+    """Decorator that runs ``func`` only on the rank-0 process.
+    On every other rank the wrapped call does nothing and returns ``None``.
+    """
+    @functools.wraps(func)
+    def rank_zero_call(*args, **kwargs):
+        rank = get_dist_info()[0]
+        if rank != 0:
+            return None
+        return func(*args, **kwargs)
+    return rank_zero_call
+def get_bare_model(net):
+    """Return the underlying module of ``net``.
+    A ``DataParallel`` or ``DistributedDataParallel`` wrapper is unwrapped via
+    its ``module`` attribute; any other object is returned unchanged.
+    """
+    is_wrapped = isinstance(net, (DataParallel, DistributedDataParallel))
+    return net.module if is_wrapped else net

docs/AdapterZoo.md ADDED Viewed

	@@ -0,0 +1,16 @@

+# Adapter Zoo
+You can download the adapters from <https://huggingface.co/TencentARC/T2I-Adapter/tree/main>
+All of the following adapters are trained with Stable Diffusion (SD) V1.4. They can be used directly on custom models, as long as those models are fine-tuned from the same text-to-image model, such as Anything-4.0 or the models on <https://civitai.com/>.
+| Adapter Name  | Adapter Description | Demos|Model Parameters|  Model Storage | |
+| --- | --- |--- |--- |--- |---|
+| t2iadapter_color_sd14v1.pth | Spatial color palette → image | [Demos](examples.md#color-adapter-spatial-palette) |18 M | 75 MB | |
+| t2iadapter_style_sd14v1.pth | Image style → image | [Demos](examples.md#style-adapter)|| 154MB |  Preliminary model. Style adapters with finer controls are on the way|
+| t2iadapter_openpose_sd14v1.pth | Openpose → image| [Demos](examples.md#openpose-adapter) |77 M| 309 MB | |
+| t2iadapter_canny_sd14v1.pth | Canny edges → image | [Demos](examples.md#canny-adapter-edge)|77 M | 309 MB ||
+| t2iadapter_sketch_sd14v1.pth | sketch → image ||77 M| 308 MB | |
+| t2iadapter_keypose_sd14v1.pth | keypose → image || 77 M| 309 MB | mmpose style |
+| t2iadapter_seg_sd14v1.pth | segmentation → image ||77 M| 309 MB ||
+| t2iadapter_depth_sd14v1.pth | depth maps → image ||77 M | 309 MB | Not the final model, still under training|

docs/FAQ.md ADDED Viewed

	@@ -0,0 +1,5 @@

+# FAQ
+- **Q: The openpose adapter (t2iadapter_openpose_sd14v1) outputs gray-scale images.**
+    **A:** You can add `colorful` in the prompt to avoid this problem.

docs/examples.md ADDED Viewed

	@@ -0,0 +1,41 @@

+# Demos
+## Style Adapter
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222734169-d47789e8-e83c-48c2-80ef-a896c2bafbb0.png" height=450>
+</p>
+## Color Adapter (Spatial Palette)
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222915829-ccfb0366-13a8-484a-9561-627fabd87d29.png" height=450>
+</p>
+## Openpose Adapter
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222733916-dc26a66e-d786-4407-8889-b81804862b1a.png" height=450>
+</p>
+## Canny Adapter (Edge)
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/222915813-c8f264bd-1be6-4496-97ff-aec4f6b53788.png" height=450>
+</p>
+## Multi-adapters
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/17445847/220939329-379f88b7-444f-4a3a-9de0-8f90605d1d34.png" height=450>
+</p>
+<div align="center">
+*T2I adapters naturally support using multiple adapters together.*
+</div><br />
+The testing script usage for this example is similar to the command line given below, except that we replaced the pretrained SD model with Anything 4.5 and Kenshi:
+>python test_composable_adapters.py --prompt "1girl, computer desk, best quality, extremely detailed" --neg_prompt "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality" --depth_cond_path examples/depth/desk_depth.png --depth_cond_weight 1.0 --depth_ckpt models/t2iadapter_depth_sd14v1.pth --depth_type_in depth --pose_cond_path examples/keypose/person_keypose.png --pose_cond_weight 1.5 --ckpt models/anything-v4.0-pruned.ckpt --n_sample 4 --max_resolution 524288
+[Image source](https://twitter.com/toyxyz3/status/1628375164781211648)

environment.yaml DELETED Viewed

@@ -1,31 +0,0 @@
-name: ldm
-channels:
-  - pytorch
-  - defaults
-dependencies:
-  - python=3.8.5
-  - pip=20.3
-  - cudatoolkit=11.3
-  - pytorch=1.11.0
-  - torchvision=0.12.0
-  - numpy=1.19.2
-  - pip:
-    - albumentations==0.4.3
-    - diffusers
-    - opencv-python==4.1.2.30
-    - pudb==2019.2
-    - invisible-watermark
-    - imageio==2.9.0
-    - imageio-ffmpeg==0.4.2
-    - pytorch-lightning==1.4.2
-    - omegaconf==2.1.1
-    - test-tube>=0.7.5
-    - streamlit>=0.73.1
-    - einops==0.3.0
-    - torch-fidelity==0.3.0
-    - transformers==4.19.2
-    - torchmetrics==0.6.0
-    - kornia==0.6
-    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
-    - -e git+https://github.com/openai/CLIP.git@main#egg=clip
-    - -e .

ldm/modules/structure_condition/midas/__init__.py → experiments/README.md RENAMED Viewed

File without changes

ldm/data/base.py DELETED Viewed

@@ -1,23 +0,0 @@
-from abc import abstractmethod
-from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
-class Txt2ImgIterableBaseDataset(IterableDataset):
-    '''
-    Define an interface to make the IterableDatasets for text2img data chainable
-    '''
-    def __init__(self, num_records=0, valid_ids=None, size=256):
-        super().__init__()
-        self.num_records = num_records
-        self.valid_ids = valid_ids
-        self.sample_ids = valid_ids
-        self.size = size
-        print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
-    def __len__(self):
-        return self.num_records
-    @abstractmethod
-    def __iter__(self):
-        pass

ldm/data/dataset_coco.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import json
+import cv2
+import os
+from basicsr.utils import img2tensor
+class dataset_coco_mask_color():
+    """COCO caption dataset yielding an image, its colour mask and a caption.
+    One entry is created per annotation in the JSON file. Images are looked up
+    as ``<root_path_im>/<image_id padded to 12 digits>.jpg`` and masks as
+    ``<root_path_mask>/<same stem>.png``.
+    Args:
+        path_json (str): COCO caption JSON with an ``annotations`` list whose
+            items carry ``image_id`` and ``caption``.
+        root_path_im (str): Directory holding the ``.jpg`` images.
+        root_path_mask (str): Directory holding the ``.png`` masks.
+        image_size: Accepted for interface compatibility; it is not used here,
+            samples are always resized to 512x512.
+    """
+    def __init__(self, path_json, root_path_im, root_path_mask, image_size):
+        super(dataset_coco_mask_color, self).__init__()
+        with open(path_json, 'r', encoding='utf-8') as fp:
+            data = json.load(fp)
+        self.root_path_im = root_path_im
+        self.root_path_mask = root_path_mask
+        self.files = [{
+            'name': "%012d.png" % ann['image_id'],
+            'sentence': ann['caption']
+        } for ann in data['annotations']]
+    @staticmethod
+    def _load_rgb_tensor(path):
+        """Read ``path``, resize to 512x512 and return it as an RGB tensor divided by 255."""
+        img = cv2.imread(path)
+        # cv2.imread reports an unreadable file by returning None instead of
+        # raising; fail here with the offending path rather than in cv2.resize.
+        if img is None:
+            raise FileNotFoundError(f'Could not read image: {path}')
+        img = cv2.resize(img, (512, 512))
+        return img2tensor(img, bgr2rgb=True, float32=True) / 255.
+    def __getitem__(self, idx):
+        """Return ``{'im', 'mask', 'sentence'}`` for sample ``idx``.
+        Raises:
+            FileNotFoundError: If the image or the mask cannot be read.
+        """
+        file = self.files[idx]
+        name = file['name']
+        im = self._load_rgb_tensor(os.path.join(self.root_path_im, name.replace('.png', '.jpg')))
+        mask = self._load_rgb_tensor(os.path.join(self.root_path_mask, name))
+        return {'im': im, 'mask': mask, 'sentence': file['sentence']}
+    def __len__(self):
+        """Number of caption annotations."""
+        return len(self.files)

ldm/data/dataset_depth.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import json
+import cv2
+import os
+from basicsr.utils import img2tensor
+class DepthDataset():
+    """Dataset of images paired with a depth map and a caption file.
+    ``meta_file`` lists one image path per line. For ``foo/bar.jpg`` the depth
+    map is read from ``foo/bar.depth.png`` and the caption from the first line
+    of ``foo/bar.txt``.
+    Args:
+        meta_file (str): Text file with one image path per line. Blank lines
+            are ignored.
+    """
+    def __init__(self, meta_file):
+        super(DepthDataset, self).__init__()
+        self.files = []
+        with open(meta_file, 'r') as f:
+            for line in f:
+                img_path = line.strip()
+                # A blank (or trailing empty) line would otherwise produce an
+                # entry with empty paths that can only fail at load time.
+                if not img_path:
+                    continue
+                stem = img_path.rsplit('.', 1)[0]
+                self.files.append({
+                    'img_path': img_path,
+                    'depth_img_path': stem + '.depth.png',
+                    'txt_path': stem + '.txt'
+                })
+    @staticmethod
+    def _load_rgb_tensor(path):
+        """Read ``path`` and return it as an RGB tensor divided by 255."""
+        img = cv2.imread(path)
+        # cv2.imread reports an unreadable file by returning None instead of raising.
+        if img is None:
+            raise FileNotFoundError(f'Could not read image: {path}')
+        return img2tensor(img, bgr2rgb=True, float32=True) / 255.
+    def __getitem__(self, idx):
+        """Return ``{'im', 'depth', 'sentence'}`` for sample ``idx``.
+        Raises:
+            FileNotFoundError: If the image or the depth map cannot be read.
+        """
+        file = self.files[idx]
+        im = self._load_rgb_tensor(file['img_path'])
+        depth = self._load_rgb_tensor(file['depth_img_path'])
+        with open(file['txt_path'], 'r') as fs:
+            sentence = fs.readline().strip()
+        return {'im': im, 'depth': depth, 'sentence': sentence}
+    def __len__(self):
+        """Number of listed images."""
+        return len(self.files)

ldm/data/dataset_laion.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# -*- coding: utf-8 -*-
+import numpy as np
+import os
+import pytorch_lightning as pl
+import torch
+import webdataset as wds
+from torchvision.transforms import transforms
+from ldm.util import instantiate_from_config
+def dict_collation_fn(samples, combine_tensors=True, combine_scalars=True):
+    """Collate a list of sample dicts into one batch dict.
+    Only keys present in every sample are kept. The type of a key's first
+    value decides how its values are combined:
+    * ``int`` / ``float``: ``np.array`` (dropped if ``combine_scalars`` is False)
+    * ``torch.Tensor``: ``torch.stack`` (dropped if ``combine_tensors`` is False)
+    * ``np.ndarray``: ``np.array`` (dropped if ``combine_tensors`` is False)
+    * anything else: plain list
+    :param list samples: list of samples, each a dict
+    :param bool combine_tensors: whether to batch tensors and ndarrays
+    :param bool combine_scalars: whether to batch int/float values
+    :returns: single sample consisting of a batch; empty for empty ``samples``
+    :rtype: dict
+    """
+    # set.intersection() needs at least one argument, so guard the empty batch.
+    if not samples:
+        return {}
+    keys = set.intersection(*[set(sample.keys()) for sample in samples])
+    batched = {key: [sample[key] for sample in samples] for key in keys}
+    result = {}
+    for key, values in batched.items():
+        first = values[0]
+        if isinstance(first, (int, float)):
+            if combine_scalars:
+                result[key] = np.array(values)
+        elif isinstance(first, torch.Tensor):
+            if combine_tensors:
+                result[key] = torch.stack(values)
+        elif isinstance(first, np.ndarray):
+            if combine_tensors:
+                result[key] = np.array(values)
+        else:
+            result[key] = values
+    return result
+class WebDataModuleFromConfig(pl.LightningDataModule):
+    """Lightning data module that builds a webdataset loader from a config.
+    Only the training split gets a loader; the validation and test loaders
+    are ``None``.
+    """
+    def __init__(self,
+                 tar_base,
+                 batch_size,
+                 train=None,
+                 validation=None,
+                 test=None,
+                 num_workers=4,
+                 multinode=True,
+                 min_size=None,
+                 max_pwatermark=1.0,
+                 **kwargs):
+        super().__init__()
+        print(f'Setting tar base to {tar_base}')
+        self.tar_base = tar_base
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        # Per-split dataset configs.
+        self.train = train
+        self.validation = validation
+        self.test = test
+        self.multinode = multinode
+        # Samples smaller than min_size are dropped (None disables filtering).
+        self.min_size = min_size
+        # Samples whose pwatermark exceeds this value are dropped.
+        self.max_pwatermark = max_pwatermark
+    def make_loader(self, dataset_config):
+        """Build the ``wds.WebLoader`` described by ``dataset_config``."""
+        transform_list = [instantiate_from_config(cfg) for cfg in dataset_config.image_transforms]
+        image_transforms = transforms.Compose(transform_list)
+        process = instantiate_from_config(dataset_config['process'])
+        shuffle = dataset_config.get('shuffle', 0)
+        if self.multinode:
+            nodesplitter = wds.shardlists.split_by_node
+        else:
+            nodesplitter = wds.shardlists.single_node_only
+        tars = os.path.join(self.tar_base, dataset_config.shards)
+        dset = wds.WebDataset(
+            tars, nodesplitter=nodesplitter, shardshuffle=shuffle > 0, handler=wds.warn_and_continue)
+        dset = dset.repeat()
+        dset = dset.shuffle(shuffle)
+        print(f'Loading webdataset with {len(dset.pipeline[0].urls)} shards.')
+        # Filter by keys, decode to PIL, filter by size, transform, post-process.
+        dset = dset.select(self.filter_keys)
+        dset = dset.decode('pil', handler=wds.warn_and_continue)
+        dset = dset.select(self.filter_size)
+        dset = dset.map_dict(jpg=image_transforms, handler=wds.warn_and_continue)
+        dset = dset.map(process)
+        dset = dset.batched(self.batch_size, partial=False, collation_fn=dict_collation_fn)
+        # The dataset is already batched, hence batch_size=None on the loader.
+        return wds.WebLoader(dset, batch_size=None, shuffle=False, num_workers=self.num_workers)
+    def filter_size(self, x):
+        """Return True if sample ``x`` passes the size and watermark limits.
+        Always True when ``min_size`` is None; False when the metadata lookup
+        or a comparison raises.
+        """
+        if self.min_size is None:
+            return True
+        try:
+            meta = x['json']
+            return (meta['original_width'] >= self.min_size
+                    and meta['original_height'] >= self.min_size
+                    and meta['pwatermark'] <= self.max_pwatermark)
+        except Exception:
+            return False
+    def filter_keys(self, x):
+        """Return True if sample ``x`` contains both the "jpg" and "txt" keys."""
+        try:
+            return all(key in x for key in ("jpg", "txt"))
+        except Exception:
+            return False
+    def train_dataloader(self):
+        """Loader for the training split."""
+        return self.make_loader(self.train)
+    def val_dataloader(self):
+        """No validation loader is provided."""
+        return None
+    def test_dataloader(self):
+        """No test loader is provided."""
+        return None
+if __name__ == '__main__':
+    # Smoke test: build the training loader from a config and print the keys
+    # and the shape of the 'jpg' entry of every batch.
+    from omegaconf import OmegaConf
+    cfg = OmegaConf.load("configs/stable-diffusion/train_canny_sd_v1.yaml")
+    data_module = WebDataModuleFromConfig(**cfg["data"]["params"])
+    for batch in data_module.train_dataloader():
+        print(batch.keys())
+        print(batch['jpg'].shape)

ldm/data/dataset_wikiart.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import json
+import os.path
+from PIL import Image
+from torch.utils.data import DataLoader
+from transformers import CLIPProcessor
+from torchvision.transforms import transforms
+import pytorch_lightning as pl
+class WikiArtDataset():
+    """WikiArt images paired with a caption derived from the file name.
+    Args:
+        meta_file (str): JSON file containing image paths relative to
+            ``datasets/wikiart``.
+    Each sample is a dict with:
+        ``jpg``: image resized to 512 (``transforms.Resize``), randomly cropped to
+            512x512 and converted with ``ToTensor``;
+        ``style``: ``pixel_values`` produced by the CLIP processor for the image;
+        ``txt``: caption parsed from the file name.
+    """
+    def __init__(self, meta_file):
+        super(WikiArtDataset, self).__init__()
+        self.files = []
+        with open(meta_file, 'r') as f:
+            js = json.load(f)
+        for img_path in js:
+            sentence = self._caption_from_path(img_path)
+            # Names made up only of numeric tokens carry no caption; skip them.
+            if sentence is None:
+                continue
+            self.files.append({'img_path': os.path.join('datasets/wikiart', img_path), 'sentence': sentence})
+        version = 'openai/clip-vit-large-patch14'
+        self.processor = CLIPProcessor.from_pretrained(version)
+        self.jpg_transform = transforms.Compose([
+            transforms.Resize(512),
+            transforms.RandomCrop(512),
+            transforms.ToTensor(),
+        ])
+    @staticmethod
+    def _caption_from_path(img_path):
+        """Derive a caption from a file name such as ``artist_some-title-1919.jpg``.
+        The part after the last ``_`` is split on ``-`` and trailing all-digit
+        tokens are dropped. Returns ``None`` when no token remains.
+        """
+        img_name = os.path.splitext(os.path.basename(img_path))[0]
+        words = img_name.split('_')[-1].split('-')
+        while words and words[-1].isdigit():
+            words.pop()
+        return ' '.join(words) if words else None
+    def __getitem__(self, idx):
+        """Return ``{'jpg', 'style', 'txt'}`` for sample ``idx``."""
+        file = self.files[idx]
+        # Force three channels: grayscale, palette or RGBA files would otherwise
+        # yield tensors with a different channel count that cannot be stacked
+        # into a batch. The context manager closes the file handle once the
+        # pixel data has been loaded by convert().
+        with Image.open(file['img_path']) as raw:
+            im = raw.convert('RGB')
+        im_tensor = self.jpg_transform(im)
+        clip_im = self.processor(images=im, return_tensors="pt")['pixel_values'][0]
+        return {'jpg': im_tensor, 'style': clip_im, 'txt': file['sentence']}
+    def __len__(self):
+        """Number of images that produced a caption."""
+        return len(self.files)
+class WikiArtDataModule(pl.LightningDataModule):
+    """Lightning data module serving ``WikiArtDataset`` as the training set.
+    Args:
+        meta_file (str): Passed to ``WikiArtDataset``.
+        batch_size (int): Batch size of the training loader.
+        num_workers (int): Worker count of the training loader.
+    """
+    def __init__(self, meta_file, batch_size, num_workers):
+        super().__init__()
+        self.train_dataset = WikiArtDataset(meta_file)
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+    def train_dataloader(self):
+        """Return a shuffling ``DataLoader`` with pinned memory."""
+        loader_options = {
+            'batch_size': self.batch_size,
+            'shuffle': True,
+            'num_workers': self.num_workers,
+            'pin_memory': True,
+        }
+        return DataLoader(self.train_dataset, **loader_options)

ldm/data/imagenet.py DELETED Viewed

@@ -1,394 +0,0 @@
-import os, yaml, pickle, shutil, tarfile, glob
-import cv2
-import albumentations
-import PIL
-import numpy as np
-import torchvision.transforms.functional as TF
-from omegaconf import OmegaConf
-from functools import partial
-from PIL import Image
-from tqdm import tqdm
-from torch.utils.data import Dataset, Subset
-import taming.data.utils as tdu
-from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
-from taming.data.imagenet import ImagePaths
-from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
-def synset2idx(path_to_yaml="data/index_synset.yaml"):
-    with open(path_to_yaml) as f:
-        di2s = yaml.load(f)
-    return dict((v,k) for k,v in di2s.items())
-class ImageNetBase(Dataset):
-    def __init__(self, config=None):
-        self.config = config or OmegaConf.create()
-        if not type(self.config)==dict:
-            self.config = OmegaConf.to_container(self.config)
-        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
-        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
-        self._prepare()
-        self._prepare_synset_to_human()
-        self._prepare_idx_to_synset()
-        self._prepare_human_to_integer_label()
-        self._load()
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, i):
-        return self.data[i]
-    def _prepare(self):
-        raise NotImplementedError()
-    def _filter_relpaths(self, relpaths):
-        ignore = set([
-            "n06596364_9591.JPEG",
-        ])
-        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
-        if "sub_indices" in self.config:
-            indices = str_to_indices(self.config["sub_indices"])
-            synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn)  # returns a list of strings
-            self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
-            files = []
-            for rpath in relpaths:
-                syn = rpath.split("/")[0]
-                if syn in synsets:
-                    files.append(rpath)
-            return files
-        else:
-            return relpaths
-    def _prepare_synset_to_human(self):
-        SIZE = 2655750
-        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
-        self.human_dict = os.path.join(self.root, "synset_human.txt")
-        if (not os.path.exists(self.human_dict) or
-                not os.path.getsize(self.human_dict)==SIZE):
-            download(URL, self.human_dict)
-    def _prepare_idx_to_synset(self):
-        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
-        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
-        if (not os.path.exists(self.idx2syn)):
-            download(URL, self.idx2syn)
-    def _prepare_human_to_integer_label(self):
-        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
-        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
-        if (not os.path.exists(self.human2integer)):
-            download(URL, self.human2integer)
-        with open(self.human2integer, "r") as f:
-            lines = f.read().splitlines()
-            assert len(lines) == 1000
-            self.human2integer_dict = dict()
-            for line in lines:
-                value, key = line.split(":")
-                self.human2integer_dict[key] = int(value)
-    def _load(self):
-        with open(self.txt_filelist, "r") as f:
-            self.relpaths = f.read().splitlines()
-            l1 = len(self.relpaths)
-            self.relpaths = self._filter_relpaths(self.relpaths)
-            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
-        self.synsets = [p.split("/")[0] for p in self.relpaths]
-        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
-        unique_synsets = np.unique(self.synsets)
-        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
-        if not self.keep_orig_class_label:
-            self.class_labels = [class_dict[s] for s in self.synsets]
-        else:
-            self.class_labels = [self.synset2idx[s] for s in self.synsets]
-        with open(self.human_dict, "r") as f:
-            human_dict = f.read().splitlines()
-            human_dict = dict(line.split(maxsplit=1) for line in human_dict)
-        self.human_labels = [human_dict[s] for s in self.synsets]
-        labels = {
-            "relpath": np.array(self.relpaths),
-            "synsets": np.array(self.synsets),
-            "class_label": np.array(self.class_labels),
-            "human_label": np.array(self.human_labels),
-        }
-        if self.process_images:
-            self.size = retrieve(self.config, "size", default=256)
-            self.data = ImagePaths(self.abspaths,
-                                   labels=labels,
-                                   size=self.size,
-                                   random_crop=self.random_crop,
-                                   )
-        else:
-            self.data = self.abspaths
-class ImageNetTrain(ImageNetBase):
-    NAME = "ILSVRC2012_train"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
-    FILES = [
-        "ILSVRC2012_img_train.tar",
-    ]
-    SIZES = [
-        147897477120,
-    ]
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.process_images = process_images
-        self.data_root = data_root
-        super().__init__(**kwargs)
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 1281167
-        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
-                                    default=True)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-                print("Extracting sub-tars.")
-                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
-                for subpath in tqdm(subpaths):
-                    subdir = subpath[:-len(".tar")]
-                    os.makedirs(subdir, exist_ok=True)
-                    with tarfile.open(subpath, "r:") as tar:
-                        tar.extractall(path=subdir)
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-            tdu.mark_prepared(self.root)
-class ImageNetValidation(ImageNetBase):
-    NAME = "ILSVRC2012_validation"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
-    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
-    FILES = [
-        "ILSVRC2012_img_val.tar",
-        "validation_synset.txt",
-    ]
-    SIZES = [
-        6744924160,
-        1950000,
-    ]
-    def __init__(self, process_images=True, data_root=None, **kwargs):
-        self.data_root = data_root
-        self.process_images = process_images
-        super().__init__(**kwargs)
-    def _prepare(self):
-        if self.data_root:
-            self.root = os.path.join(self.data_root, self.NAME)
-        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
-        self.expected_length = 50000
-        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
-                                    default=False)
-        if not tdu.is_prepared(self.root):
-            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
-            datadir = self.datadir
-            if not os.path.exists(datadir):
-                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
-                    import academictorrents as at
-                    atpath = at.get(self.AT_HASH, datastore=self.root)
-                    assert atpath == path
-                print("Extracting {} to {}".format(path, datadir))
-                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
-                    tar.extractall(path=datadir)
-                vspath = os.path.join(self.root, self.FILES[1])
-                if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
-                    download(self.VS_URL, vspath)
-                with open(vspath, "r") as f:
-                    synset_dict = f.read().splitlines()
-                    synset_dict = dict(line.split() for line in synset_dict)
-                print("Reorganizing into synset folders")
-                synsets = np.unique(list(synset_dict.values()))
-                for s in synsets:
-                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
-                for k, v in synset_dict.items():
-                    src = os.path.join(datadir, k)
-                    dst = os.path.join(datadir, v)
-                    shutil.move(src, dst)
-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
-            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
-            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
-                f.write(filelist)
-            tdu.mark_prepared(self.root)
-class ImageNetSR(Dataset):
-    def __init__(self, size=None,
-                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
-                 random_crop=True):
-        """
-        Imagenet Superresolution Dataloader
-        Performs following ops in order:
-        1.  crops a crop of size s from image either as random or center crop
-        2.  resizes crop to size with cv2.area_interpolation
-        3.  degrades resized crop with degradation_fn
-        :param size: resizing to size after cropping
-        :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light
-        :param downscale_f: Low Resolution Downsample factor
-        :param min_crop_f: determines crop size s,
-          where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f)
-        :param max_crop_f: ""
-        :param data_root:
-        :param random_crop:
-        """
-        self.base = self.get_base()
-        assert size
-        assert (size / downscale_f).is_integer()
-        self.size = size
-        self.LR_size = int(size / downscale_f)
-        self.min_crop_f = min_crop_f
-        self.max_crop_f = max_crop_f
-        assert(max_crop_f <= 1.)
-        self.center_crop = not random_crop
-        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
-        self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
-        if degradation == "bsrgan":
-            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
-        elif degradation == "bsrgan_light":
-            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
-        else:
-            interpolation_fn = {
-            "cv_nearest": cv2.INTER_NEAREST,
-            "cv_bilinear": cv2.INTER_LINEAR,
-            "cv_bicubic": cv2.INTER_CUBIC,
-            "cv_area": cv2.INTER_AREA,
-            "cv_lanczos": cv2.INTER_LANCZOS4,
-            "pil_nearest": PIL.Image.NEAREST,
-            "pil_bilinear": PIL.Image.BILINEAR,
-            "pil_bicubic": PIL.Image.BICUBIC,
-            "pil_box": PIL.Image.BOX,
-            "pil_hamming": PIL.Image.HAMMING,
-            "pil_lanczos": PIL.Image.LANCZOS,
-            }[degradation]
-            self.pil_interpolation = degradation.startswith("pil_")
-            if self.pil_interpolation:
-                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
-            else:
-                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
-                                                                          interpolation=interpolation_fn)
-    def __len__(self):
-        return len(self.base)
-    def __getitem__(self, i):
-        example = self.base[i]
-        image = Image.open(example["file_path_"])
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-        image = np.array(image).astype(np.uint8)
-        min_side_len = min(image.shape[:2])
-        crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
-        crop_side_len = int(crop_side_len)
-        if self.center_crop:
-            self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
-        else:
-            self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
-        image = self.cropper(image=image)["image"]
-        image = self.image_rescaler(image=image)["image"]
-        if self.pil_interpolation:
-            image_pil = PIL.Image.fromarray(image)
-            LR_image = self.degradation_process(image_pil)
-            LR_image = np.array(LR_image).astype(np.uint8)
-        else:
-            LR_image = self.degradation_process(image=image)["image"]
-        example["image"] = (image/127.5 - 1.0).astype(np.float32)
-        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
-        return example
-class ImageNetSRTrain(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-    def get_base(self):
-        with open("data/imagenet_train_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetTrain(process_images=False,)
-        return Subset(dset, indices)
-class ImageNetSRValidation(ImageNetSR):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-    def get_base(self):
-        with open("data/imagenet_val_hr_indices.p", "rb") as f:
-            indices = pickle.load(f)
-        dset = ImageNetValidation(process_images=False,)
-        return Subset(dset, indices)

ldm/data/lsun.py DELETED Viewed

@@ -1,92 +0,0 @@
-import os
-import numpy as np
-import PIL
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-class LSUNBase(Dataset):
-    def __init__(self,
-                 txt_file,
-                 data_root,
-                 size=None,
-                 interpolation="bicubic",
-                 flip_p=0.5
-                 ):
-        self.data_paths = txt_file
-        self.data_root = data_root
-        with open(self.data_paths, "r") as f:
-            self.image_paths = f.read().splitlines()
-        self._length = len(self.image_paths)
-        self.labels = {
-            "relative_file_path_": [l for l in self.image_paths],
-            "file_path_": [os.path.join(self.data_root, l)
-                           for l in self.image_paths],
-        }
-        self.size = size
-        self.interpolation = {"linear": PIL.Image.LINEAR,
-                              "bilinear": PIL.Image.BILINEAR,
-                              "bicubic": PIL.Image.BICUBIC,
-                              "lanczos": PIL.Image.LANCZOS,
-                              }[interpolation]
-        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
-    def __len__(self):
-        return self._length
-    def __getitem__(self, i):
-        example = dict((k, self.labels[k][i]) for k in self.labels)
-        image = Image.open(example["file_path_"])
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-        crop = min(img.shape[0], img.shape[1])
-        h, w, = img.shape[0], img.shape[1]
-        img = img[(h - crop) // 2:(h + crop) // 2,
-              (w - crop) // 2:(w + crop) // 2]
-        image = Image.fromarray(img)
-        if self.size is not None:
-            image = image.resize((self.size, self.size), resample=self.interpolation)
-        image = self.flip(image)
-        image = np.array(image).astype(np.uint8)
-        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
-        return example
-class LSUNChurchesTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
-class LSUNChurchesValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
-                         flip_p=flip_p, **kwargs)
-class LSUNBedroomsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
-class LSUNBedroomsValidation(LSUNBase):
-    def __init__(self, flip_p=0.0, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
-                         flip_p=flip_p, **kwargs)
-class LSUNCatsTrain(LSUNBase):
-    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
-class LSUNCatsValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
-                         flip_p=flip_p, **kwargs)

ldm/data/utils.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# -*- coding: utf-8 -*-
+import cv2
+import numpy as np
+from torchvision.transforms import transforms
+from torchvision.transforms.functional import to_tensor
+from transformers import CLIPProcessor
+from basicsr.utils import img2tensor
+class AddCannyFreezeThreshold(object):
+    """Sample transform that adds a Canny edge map computed with fixed thresholds.
+
+    Calling the instance on a sample dict stores the edge map under 'canny'
+    (scaled by 1/255) and replaces 'jpg' with its ``to_tensor`` conversion.
+
+    Args:
+        low_threshold: lower hysteresis threshold passed to ``cv2.Canny``.
+        high_threshold: upper hysteresis threshold passed to ``cv2.Canny``.
+    """
+    def __init__(self, low_threshold=100, high_threshold=200):
+        self.low_threshold, self.high_threshold = low_threshold, high_threshold
+    def __call__(self, sample):
+        # sample['jpg'] is expected to be a PIL image (per the original note)
+        pil_img = sample['jpg']
+        bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
+        edges = cv2.Canny(bgr, self.low_threshold, self.high_threshold)
+        # append a trailing channel axis before the tensor conversion
+        edges = edges[..., None]
+        sample['canny'] = img2tensor(edges, bgr2rgb=True, float32=True) / 255.
+        sample['jpg'] = to_tensor(pil_img)
+        return sample
+class AddStyle(object):
+    """Sample transform that adds CLIP-preprocessed pixels under 'style'.
+
+    Args:
+        version: identifier passed to ``CLIPProcessor.from_pretrained``.
+    """
+    def __init__(self, version):
+        self.processor = CLIPProcessor.from_pretrained(version)
+        # not used inside this class; kept for code that may read the attribute
+        self.pil_to_tensor = transforms.ToTensor()
+    def __call__(self, sample):
+        # sample['jpg'] is expected to be a PIL image (per the original note)
+        pil_img = sample['jpg']
+        processed = self.processor(images=pil_img, return_tensors="pt")
+        sample['style'] = processed['pixel_values'][0]
+        sample['jpg'] = to_tensor(pil_img)
+        return sample

ldm/inference_base.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import argparse
+import torch
+from omegaconf import OmegaConf
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.modules.encoders.adapter import Adapter, StyleAdapter, Adapter_light
+from ldm.modules.extra_condition.api import ExtraCondition
+from ldm.util import fix_cond_shapes, load_model_from_config, read_state_dict
+# Default value of --neg_prompt.
+DEFAULT_NEGATIVE_PROMPT = (
+    'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, '
+    'fewer digits, cropped, worst quality, low quality'
+)
+def get_base_argument_parser() -> argparse.ArgumentParser:
+    """Build the argument parser shared by the inference scripts.
+
+    Returns:
+        An ``argparse.ArgumentParser`` with the common options registered;
+        callers may add their own arguments before parsing.
+    """
+    # (flag, add_argument keyword arguments), registered in this order
+    arg_specs = [
+        ('--outdir', dict(type=str, default=None, help='dir to write results to')),
+        ('--prompt', dict(type=str, nargs='?', default=None, help='positive prompt')),
+        ('--neg_prompt', dict(type=str, default=DEFAULT_NEGATIVE_PROMPT, help='negative prompt')),
+        ('--cond_path', dict(type=str, default=None, help='condition image path')),
+        ('--cond_inp_type', dict(
+            type=str,
+            default='image',
+            help='the type of the input condition image, take depth T2I as example, the input can be raw image, '
+            'which depth will be calculated, or the input can be a directly a depth map image')),
+        ('--sampler', dict(
+            type=str,
+            default='ddim',
+            choices=['ddim', 'plms'],
+            help='sampling algorithm, currently, only ddim and plms are supported, more are on the way')),
+        ('--steps', dict(type=int, default=50, help='number of sampling steps')),
+        ('--sd_ckpt', dict(
+            type=str,
+            default='models/sd-v1-4.ckpt',
+            help='path to checkpoint of stable diffusion model, both .ckpt and .safetensor are supported')),
+        ('--vae_ckpt', dict(
+            type=str,
+            default=None,
+            help='vae checkpoint, anime SD models usually have seperate vae ckpt that need to be loaded')),
+        ('--adapter_ckpt', dict(type=str, default=None, help='path to checkpoint of adapter')),
+        ('--config', dict(
+            type=str,
+            default='configs/stable-diffusion/sd-v1-inference.yaml',
+            help='path to config which constructs SD model')),
+        ('--max_resolution', dict(
+            type=float,
+            default=512 * 512,
+            help='max image height * width, only for computer with limited vram')),
+        ('--resize_short_edge', dict(
+            type=int,
+            default=None,
+            help='resize short edge of the input image, if this arg is set, max_resolution will not be used')),
+        ('--C', dict(type=int, default=4, help='latent channels')),
+        ('--f', dict(type=int, default=8, help='downsampling factor')),
+        ('--scale', dict(
+            type=float,
+            default=7.5,
+            help='unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))')),
+        ('--cond_tau', dict(
+            type=float,
+            default=1.0,
+            help='timestamp parameter that determines until which step the adapter is applied, '
+            'similar as Prompt-to-Prompt tau')),
+        ('--cond_weight', dict(
+            type=float,
+            default=1.0,
+            help='the adapter features are multiplied by the cond_weight. The larger the cond_weight, the more aligned '
+            'the generated image and condition will be, but the generated quality may be reduced')),
+        ('--seed', dict(type=int, default=42)),
+        ('--n_samples', dict(type=int, default=4, help='# of samples to generate')),
+    ]
+    parser = argparse.ArgumentParser()
+    for flag, kwargs in arg_specs:
+        parser.add_argument(flag, **kwargs)
+    return parser
+def get_sd_models(opt):
+    """Build the Stable Diffusion model and the sampler named by ``opt.sampler``.
+
+    Reads ``opt.config``, ``opt.sd_ckpt``, ``opt.vae_ckpt``, ``opt.device`` and
+    ``opt.sampler``.
+
+    Returns:
+        ``(sd_model, sampler)``: the model moved to ``opt.device`` and a
+        ``PLMSSampler`` or ``DDIMSampler`` wrapping it.
+
+    Raises:
+        NotImplementedError: if ``opt.sampler`` is neither 'plms' nor 'ddim'.
+    """
+    # SD
+    config = OmegaConf.load(f"{opt.config}")
+    model = load_model_from_config(config, opt.sd_ckpt, opt.vae_ckpt)
+    sd_model = model.to(opt.device)
+    # sampler
+    if opt.sampler not in ('plms', 'ddim'):
+        raise NotImplementedError
+    sampler_cls = PLMSSampler if opt.sampler == 'plms' else DDIMSampler
+    return sd_model, sampler_cls(model)
+def get_t2i_adapter_models(opt):
+    """Build the SD model with adapter weights loaded into it, plus a sampler.
+
+    The adapter checkpoint path is ``opt.<which_cond>_adapter_ckpt`` when that
+    attribute exists and is not None, otherwise ``opt.adapter_ckpt``. Keys that
+    do not already start with 'adapter.' are given that prefix before being
+    loaded non-strictly; unexpected keys are printed.
+
+    Returns:
+        ``(model, sampler)``: the model moved to ``opt.device`` and a
+        ``PLMSSampler`` or ``DDIMSampler`` wrapping it.
+
+    Raises:
+        NotImplementedError: if ``opt.sampler`` is neither 'plms' nor 'ddim'.
+    """
+    config = OmegaConf.load(f"{opt.config}")
+    model = load_model_from_config(config, opt.sd_ckpt, opt.vae_ckpt)
+    ckpt_path = getattr(opt, f'{opt.which_cond}_adapter_ckpt', None)
+    if ckpt_path is None:
+        ckpt_path = getattr(opt, 'adapter_ckpt')
+    state = read_state_dict(ckpt_path)
+    prefixed = {(k if k.startswith('adapter.') else f'adapter.{k}'): v for k, v in state.items()}
+    _, unexpected = model.load_state_dict(prefixed, strict=False)
+    if len(unexpected) > 0:
+        print(f"unexpected keys in loading adapter ckpt {ckpt_path}:")
+        print(unexpected)
+    model = model.to(opt.device)
+    # sampler
+    if opt.sampler not in ('plms', 'ddim'):
+        raise NotImplementedError
+    sampler_cls = PLMSSampler if opt.sampler == 'plms' else DDIMSampler
+    return model, sampler_cls(model)
+def get_cond_ch(cond_type: ExtraCondition):
+    """Return 1 for the sketch and canny conditions, 3 for every other condition."""
+    single_channel = (ExtraCondition.sketch, ExtraCondition.canny)
+    return 1 if cond_type in single_channel else 3
+def get_adapters(opt, cond_type: ExtraCondition):
+    """Build the adapter network for ``cond_type`` and load its checkpoint.
+
+    Per-condition options take precedence over the generic ones:
+    ``opt.<cond>_weight`` over ``opt.cond_weight`` and
+    ``opt.<cond>_adapter_ckpt`` over ``opt.adapter_ckpt`` (a missing or None
+    per-condition attribute falls back to the generic one).
+
+    Args:
+        opt: options namespace; ``opt.device`` is where the adapter is placed.
+        cond_type: the condition whose adapter is built.
+
+    Returns:
+        dict with 'cond_weight' (the selected weight) and 'model' (the adapter
+        module with the checkpoint loaded).
+    """
+    adapter = {}
+    cond_weight = getattr(opt, f'{cond_type.name}_weight', None)
+    if cond_weight is None:
+        cond_weight = getattr(opt, 'cond_weight')
+    adapter['cond_weight'] = cond_weight
+    if cond_type == ExtraCondition.style:
+        adapter['model'] = StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8).to(opt.device)
+    elif cond_type == ExtraCondition.color:
+        adapter['model'] = Adapter_light(
+            cin=64 * get_cond_ch(cond_type),
+            channels=[320, 640, 1280, 1280],
+            nums_rb=4).to(opt.device)
+    else:
+        adapter['model'] = Adapter(
+            cin=64 * get_cond_ch(cond_type),
+            channels=[320, 640, 1280, 1280],
+            nums_rb=2,
+            ksize=1,
+            sk=True,
+            use_conv=False).to(opt.device)
+    ckpt_path = getattr(opt, f'{cond_type.name}_adapter_ckpt', None)
+    if ckpt_path is None:
+        ckpt_path = getattr(opt, 'adapter_ckpt')
+    # Deserialize on CPU so checkpoints saved from a GPU also load on hosts
+    # without one; load_state_dict copies the values onto the adapter's device.
+    state_dict = torch.load(ckpt_path, map_location='cpu')
+    adapter['model'].load_state_dict(state_dict)
+    return adapter
+def diffusion_inference(opt, model, sampler, adapter_features, append_to_context=None):
+    """Sample one image for ``opt.prompt`` and return it decoded and clamped to [0, 1].
+
+    The negative prompt is only encoded when ``opt.scale`` differs from 1.0;
+    otherwise the unconditional conditioning is None. If ``opt`` has no ``H``
+    attribute, ``opt.H`` and ``opt.W`` are set to 512 (this mutates ``opt``).
+
+    Args:
+        opt: options namespace (prompt, neg_prompt, scale, C, f, steps, cond_tau, ...).
+        model: object providing ``get_learned_conditioning`` and ``decode_first_stage``.
+        sampler: object whose ``sample`` method returns ``(latents, extra)``.
+        adapter_features: forwarded to the sampler as ``features_adapter``.
+        append_to_context: forwarded to the sampler unchanged.
+
+    Returns:
+        The decoded samples mapped from [-1, 1] to [0, 1] and clamped.
+    """
+    # get text embedding
+    cond = model.get_learned_conditioning([opt.prompt])
+    uncond = model.get_learned_conditioning([opt.neg_prompt]) if opt.scale != 1.0 else None
+    cond, uncond = fix_cond_shapes(model, cond, uncond)
+    if not hasattr(opt, 'H'):
+        opt.H = 512
+        opt.W = 512
+    latent_shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+    sample_kwargs = dict(
+        S=opt.steps,
+        conditioning=cond,
+        batch_size=1,
+        shape=latent_shape,
+        verbose=False,
+        unconditional_guidance_scale=opt.scale,
+        unconditional_conditioning=uncond,
+        x_T=None,
+        features_adapter=adapter_features,
+        append_to_context=append_to_context,
+        cond_tau=opt.cond_tau,
+    )
+    samples_latents, _ = sampler.sample(**sample_kwargs)
+    decoded = model.decode_first_stage(samples_latents)
+    return torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)

ldm/models/autoencoder.py CHANGED Viewed

@@ -1,64 +1,65 @@
 import torch
 import pytorch_lightning as pl
 import torch.nn.functional as F
 from contextlib import contextmanager
-from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
 from ldm.modules.diffusionmodules.model import Encoder, Decoder
 from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
 from ldm.util import instantiate_from_config
-class VQModel(pl.LightningModule):
     def __init__(self,
                  ddconfig,
                  lossconfig,
-                 n_embed,
                  embed_dim,
                  ckpt_path=None,
                  ignore_keys=[],
                  image_key="image",
                  colorize_nlabels=None,
                  monitor=None,
-                 batch_resize_range=None,
-                 scheduler_config=None,
-                 lr_g_factor=1.0,
-                 remap=None,
-                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
-                 use_ema=False
                  ):
         super().__init__()
-        self.embed_dim = embed_dim
-        self.n_embed = n_embed
         self.image_key = image_key
         self.encoder = Encoder(**ddconfig)
         self.decoder = Decoder(**ddconfig)
         self.loss = instantiate_from_config(lossconfig)
-        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
-                                        remap=remap,
-                                        sane_index_shape=sane_index_shape)
-        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
         if colorize_nlabels is not None:
             assert type(colorize_nlabels)==int
             self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
         if monitor is not None:
             self.monitor = monitor
-        self.batch_resize_range = batch_resize_range
-        if self.batch_resize_range is not None:
-            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
-        self.use_ema = use_ema
         if self.use_ema:
-            self.model_ema = LitEma(self)
             print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-        self.scheduler_config = scheduler_config
-        self.lr_g_factor = lr_g_factor
     @contextmanager
     def ema_scope(self, context=None):
@@ -75,252 +76,10 @@ class VQModel(pl.LightningModule):
                 if context is not None:
                     print(f"{context}: Restored training weights")
-    def init_from_ckpt(self, path, ignore_keys=list()):
         # Restore weights from the checkpoint at `path`, dropping every entry whose
         # key starts with one of the `ignore_keys` prefixes. Loading is non-strict.
-        sd = torch.load(path, map_location="cpu")["state_dict"]
         # Snapshot the keys so entries can be removed from `sd` while looping.
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
                     # NOTE(review): there is no break after the delete, so a key that
                     # matches two prefixes would be deleted twice and raise KeyError.
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
             # NOTE(review): unexpected keys are only printed when some key is missing.
-            print(f"Unexpected Keys: {unexpected}")
     def on_train_batch_end(self, *args, **kwargs):
         # Hook called with arbitrary arguments (all ignored): when self.use_ema is
         # truthy, self.model_ema is invoked with this module.
         if self.use_ema:
             self.model_ema(self)
-    def encode(self, x):
         # Run the encoder and quant_conv on `x`, pass the result through
         # self.quantize and return its three outputs unchanged.
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        quant, emb_loss, info = self.quantize(h)
-        return quant, emb_loss, info
-    def encode_to_prequant(self, x):
         # Encoder followed by quant_conv only; self.quantize is not applied here.
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-    def decode(self, quant):
         # Apply post_quant_conv to `quant` and feed the result to the decoder.
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-    def decode_code(self, code_b):
         # Turn `code_b` into embeddings via self.quantize.embed_code, then decode them.
-        quant_b = self.quantize.embed_code(code_b)
-        dec = self.decode(quant_b)
-        return dec
-    def forward(self, input, return_pred_indices=False):
         # Encode then decode `input`. Returns (dec, diff); when return_pred_indices
         # is truthy the third element of encode()'s info tuple is appended.
-        quant, diff, (_,_,ind) = self.encode(input)
-        dec = self.decode(quant)
-        if return_pred_indices:
-            return dec, diff, ind
-        return dec, diff
-    def get_input(self, batch, k):
         # Fetch batch[k], add a trailing axis when it is 3-D, move the last axis to
         # position 1 and cast to float.
         # presumably the input is laid out (batch, height, width, channels) - TODO confirm
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
         # Optional per-batch resizing, active only when batch_resize_range is set.
-        if self.batch_resize_range is not None:
-            lower_size = self.batch_resize_range[0]
-            upper_size = self.batch_resize_range[1]
-            if self.global_step <= 4:
-                # do the first few batches with max size to avoid later oom
-                new_resize = upper_size
-            else:
                 # Random size drawn from lower_size, lower_size+16, ... (< upper_size+16).
-                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
             # Only x.shape[2] is compared against the chosen size before resizing.
-            if new_resize != x.shape[2]:
-                x = F.interpolate(x, size=new_resize, mode="bicubic")
-            x = x.detach()
-        return x
-    def training_step(self, batch, batch_idx, optimizer_idx):
         # One training step for the optimizer selected by `optimizer_idx`:
         # 0 returns the autoencoder loss, 1 returns the discriminator loss.
         # NOTE(review): any other index falls through and returns None implicitly.
-        # https://github.com/pytorch/pytorch/issues/37142
-        # try not to fool the heuristics
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
-        if optimizer_idx == 0:
-            # autoencode
-            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train",
-                                            predicted_indices=ind)
-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return aeloss
-        if optimizer_idx == 1:
-            # discriminator
             # Unlike the branch above, predicted_indices is not passed here.
-            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
-            return discloss
-    def validation_step(self, batch, batch_idx):
         # Validate twice: once directly, and once inside ema_scope() with the
         # metric names suffixed "_ema". Only the first result is returned;
         # log_dict_ema is assigned but never used.
-        log_dict = self._validation_step(batch, batch_idx)
-        with self.ema_scope():
-            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
-        return log_dict
-    def _validation_step(self, batch, batch_idx, suffix=""):
         # Compute and log validation losses under the split name "val" + suffix.
-        x = self.get_input(batch, self.image_key)
-        xrec, qloss, ind = self(x, return_pred_indices=True)
         # The loss module is called twice, with 0 and then 1 as its fourth argument
         # (the same position training_step fills with optimizer_idx).
-        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
-                                        self.global_step,
-                                        last_layer=self.get_last_layer(),
-                                        split="val"+suffix,
-                                        predicted_indices=ind
-                                        )
-        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
-                                            self.global_step,
-                                            last_layer=self.get_last_layer(),
-                                            split="val"+suffix,
-                                            predicted_indices=ind
-                                            )
-        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log(f"val{suffix}/rec_loss", rec_loss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        self.log(f"val{suffix}/aeloss", aeloss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
         # rec_loss was logged explicitly above; on pytorch_lightning >= 1.4.0 it is
         # removed from the dict before log_dict - presumably to avoid logging the
         # same key twice; confirm.
-        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            del log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log_dict(log_dict_ae)
-        self.log_dict(log_dict_disc)
         # NOTE(review): this returns the bound method self.log_dict, not a dict.
-        return self.log_dict
-    def configure_optimizers(self):
         # Build two Adam optimizers: one over encoder, decoder, quantizer and both
         # 1x1 convs, one over the discriminator held by self.loss. Returns
         # ([opt_ae, opt_disc], schedulers), with an empty scheduler list when
         # scheduler_config is None.
         # self.learning_rate is not assigned in the visible code; presumably it is
         # set externally before training - confirm.
-        lr_d = self.learning_rate
-        lr_g = self.lr_g_factor*self.learning_rate
-        print("lr_d", lr_d)
-        print("lr_g", lr_g)
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quantize.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr_g, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr_d, betas=(0.5, 0.9))
-        if self.scheduler_config is not None:
-            scheduler = instantiate_from_config(self.scheduler_config)
-            print("Setting up LambdaLR scheduler...")
             # Both optimizers share the same schedule function and step every batch.
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-                {
-                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                },
-            ]
-            return [opt_ae, opt_disc], scheduler
-        return [opt_ae, opt_disc], []
-    def get_last_layer(self):
         # Weight of the decoder's conv_out layer; passed to self.loss as last_layer.
-        return self.decoder.conv_out.weight
-    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
         # Build a dict with "inputs" and "reconstructions" (plus
         # "reconstructions_ema" when plot_ema is set). With only_inputs the dict
         # holds just "inputs". Extra keyword arguments are ignored.
-        log = dict()
-        x = self.get_input(batch, self.image_key)
-        x = x.to(self.device)
-        if only_inputs:
-            log["inputs"] = x
-            return log
-        xrec, _ = self(x)
-        if x.shape[1] > 3:
-            # colorize with random projection
-            assert xrec.shape[1] > 3
-            x = self.to_rgb(x)
-            xrec = self.to_rgb(xrec)
-        log["inputs"] = x
-        log["reconstructions"] = xrec
-        if plot_ema:
             # NOTE(review): `x` may already have been replaced by its to_rgb
             # projection above, so the EMA pass looks like it runs on the colorized
             # tensor rather than the raw input - confirm this is intended.
-            with self.ema_scope():
-                xrec_ema, _ = self(x)
-                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
-                log["reconstructions_ema"] = xrec_ema
-        return log
-    def to_rgb(self, x):
         # Project `x` to 3 channels with a 1x1 convolution whose random weights live
         # in the "colorize" buffer (created on first use), then rescale to [-1, 1].
         # Only permitted when image_key is "segmentation".
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
-        x = F.conv2d(x, weight=self.colorize)
         # NOTE(review): uses the global min/max of the tensor; a constant tensor
         # makes the denominator zero.
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
-        return x
-class VQModelInterface(VQModel):
     # VQModel variant whose encode() stops before quantization and whose decode()
     # performs the quantization step unless told otherwise.
-    def __init__(self, embed_dim, *args, **kwargs):
-        super().__init__(embed_dim=embed_dim, *args, **kwargs)
-        self.embed_dim = embed_dim
-    def encode(self, x):
         # Encoder followed by quant_conv; returns the un-quantized result.
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-        return h
-    def decode(self, h, force_not_quantize=False):
-        # also go through quantization layer
-        if not force_not_quantize:
             # emb_loss and info are computed but discarded.
-            quant, emb_loss, info = self.quantize(h)
-        else:
-            quant = h
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant)
-        return dec
-class AutoencoderKL(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 ):
         # Build encoder/decoder from `ddconfig`, the loss from `lossconfig`, and the
         # two 1x1 convolutions around the latent; optionally load a checkpoint.
-        super().__init__()
-        self.image_key = image_key
-        self.encoder = Encoder(**ddconfig)
-        self.decoder = Decoder(**ddconfig)
-        self.loss = instantiate_from_config(lossconfig)
         # quant_conv below maps 2*z_channels to 2*embed_dim, so double_z is required.
-        assert ddconfig["double_z"]
-        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
-        self.embed_dim = embed_dim
-        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
         # self.monitor only exists when a monitor value was passed in.
-        if monitor is not None:
-            self.monitor = monitor
-        if ckpt_path is not None:
-            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
-    def init_from_ckpt(self, path, ignore_keys=list()):
         # Load the "state_dict" entry of the checkpoint at `path` non-strictly,
         # after dropping keys that start with any prefix in `ignore_keys`.
-        sd = torch.load(path, map_location="cpu")["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
                     # NOTE(review): a key matching two prefixes is deleted twice,
                     # which raises KeyError (no break after the delete).
-                    del sd[k]
         # The missing/unexpected key report from load_state_dict is discarded.
-        self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path}")
     def encode(self, x):
         h = self.encoder(x)
         moments = self.quant_conv(h)
@@ -370,25 +129,33 @@ class AutoencoderKL(pl.LightningModule):
             return discloss
     def validation_step(self, batch, batch_idx):
         # Compute both loss terms on the validation batch under split "val" and
         # log the resulting metric dicts.
         inputs = self.get_input(batch, self.image_key)
         reconstructions, posterior = self(inputs)
         # The fourth argument (0, then 1) selects which loss term is computed -
         # presumably autoencoder vs. discriminator; confirm against the loss module.
         aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
-                                        last_layer=self.get_last_layer(), split="val")
         discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
-                                            last_layer=self.get_last_layer(), split="val")
-        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
         self.log_dict(log_dict_ae)
         self.log_dict(log_dict_disc)
         # NOTE(review): this returns the bound method self.log_dict, not a dict.
         return self.log_dict
     def configure_optimizers(self):
         lr = self.learning_rate
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
                                   lr=lr, betas=(0.5, 0.9))
         opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                     lr=lr, betas=(0.5, 0.9))
@@ -398,7 +165,7 @@ class AutoencoderKL(pl.LightningModule):
         return self.decoder.conv_out.weight
     @torch.no_grad()
-    def log_images(self, batch, only_inputs=False, **kwargs):
         log = dict()
         x = self.get_input(batch, self.image_key)
         x = x.to(self.device)
@@ -423,9 +190,9 @@ class AutoencoderKL(pl.LightningModule):
         return x
-class IdentityFirstStage(torch.nn.Module):
     def __init__(self, *args, vq_interface=False, **kwargs):
         # Extra positional and keyword arguments are accepted and ignored.
         # NOTE(review): the attribute is assigned before super().__init__() runs.
-        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
         super().__init__()
     def encode(self, x, *args, **kwargs):
@@ -441,3 +208,4 @@ class IdentityFirstStage(torch.nn.Module):
     def forward(self, x, *args, **kwargs):
         # Identity: returns `x` untouched and ignores every other argument.
         return x

 import torch
 import pytorch_lightning as pl
 import torch.nn.functional as F
+import torch.nn as nn
 from contextlib import contextmanager
 from ldm.modules.diffusionmodules.model import Encoder, Decoder
 from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
 from ldm.util import instantiate_from_config
+from ldm.modules.ema import LitEma
+class AutoencoderKL(pl.LightningModule):
     def __init__(self,
                  ddconfig,
                  lossconfig,
                  embed_dim,
                  ckpt_path=None,
                  ignore_keys=[],
                  image_key="image",
                  colorize_nlabels=None,
                  monitor=None,
+                 ema_decay=None,
+                 learn_logvar=False
                  ):
         # Build encoder/decoder from `ddconfig`, the loss from `lossconfig`, and the
         # two 1x1 convolutions around the latent. EMA tracking is enabled by
         # passing `ema_decay`; `learn_logvar` is read later by configure_optimizers.
         super().__init__()
+        self.learn_logvar = learn_logvar
         self.image_key = image_key
         self.encoder = Encoder(**ddconfig)
         self.decoder = Decoder(**ddconfig)
         self.loss = instantiate_from_config(lossconfig)
         # quant_conv below maps 2*z_channels to 2*embed_dim, so double_z is required.
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
         if colorize_nlabels is not None:
             assert type(colorize_nlabels)==int
             self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
         # self.monitor only exists when a monitor value was passed in.
         if monitor is not None:
             self.monitor = monitor
         # EMA is switched on purely by ema_decay being given (must lie in (0, 1)).
+        self.use_ema = ema_decay is not None
         if self.use_ema:
+            self.ema_decay = ema_decay
+            assert 0. < ema_decay < 1.
+            self.model_ema = LitEma(self, decay=ema_decay)
             print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
         # Checkpoint loading happens last, after all submodules exist.
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+    def init_from_ckpt(self, path, ignore_keys=list()):
         # Load the "state_dict" entry of the checkpoint at `path` non-strictly,
         # after dropping keys that start with any prefix in `ignore_keys`.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
         # Snapshot the keys so entries can be removed from `sd` while looping.
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
                     # NOTE(review): a key matching two prefixes is deleted twice,
                     # which raises KeyError (no break after the delete).
+                    del sd[k]
         # The missing/unexpected key report from load_state_dict is discarded.
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
     @contextmanager
     def ema_scope(self, context=None):
                 if context is not None:
                     print(f"{context}: Restored training weights")
     def on_train_batch_end(self, *args, **kwargs):
         # Hook called with arbitrary arguments (all ignored): when self.use_ema is
         # truthy, self.model_ema is invoked with this module.
         if self.use_ema:
             self.model_ema(self)
     def encode(self, x):
         h = self.encoder(x)
         moments = self.quant_conv(h)
             return discloss
     def validation_step(self, batch, batch_idx):
         # Validate twice: once directly, and once inside ema_scope() with the
         # metric names postfixed "_ema". Only the first result is returned;
         # log_dict_ema is assigned but never used.
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():
+            log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
+        return log_dict
+    def _validation_step(self, batch, batch_idx, postfix=""):
         # Compute both loss terms on the batch under the split name "val" + postfix
         # and log the resulting metric dicts.
         inputs = self.get_input(batch, self.image_key)
         reconstructions, posterior = self(inputs)
         # The fourth argument (0, then 1) selects which loss term is computed -
         # presumably autoencoder vs. discriminator; confirm against the loss module.
         aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val"+postfix)
         discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val"+postfix)
+        self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
         self.log_dict(log_dict_ae)
         self.log_dict(log_dict_disc)
         # NOTE(review): this returns the bound method self.log_dict, not a dict.
         return self.log_dict
     def configure_optimizers(self):
         lr = self.learning_rate
+        ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
+            self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
+        if self.learn_logvar:
+            print(f"{self.__class__.__name__}: Learning logvar")
+            ae_params_list.append(self.loss.logvar)
+        opt_ae = torch.optim.Adam(ae_params_list,
                                   lr=lr, betas=(0.5, 0.9))
         opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                     lr=lr, betas=(0.5, 0.9))
         return self.decoder.conv_out.weight
     @torch.no_grad()
+    def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
         log = dict()
         x = self.get_input(batch, self.image_key)
         x = x.to(self.device)
         return x
+class IdentityFirstStage(nn.Module):
     def __init__(self, *args, vq_interface=False, **kwargs):
         # Extra positional and keyword arguments are accepted and ignored.
         # NOTE(review): the attribute is assigned before super().__init__() runs.
+        self.vq_interface = vq_interface
         super().__init__()
     def encode(self, x, *args, **kwargs):
     def forward(self, x, *args, **kwargs):
         # Identity: returns `x` untouched and ignores every other argument.
         return x

ldm/models/diffusion/classifier.py DELETED Viewed

@@ -1,267 +0,0 @@
-import os
-import torch
-import pytorch_lightning as pl
-from omegaconf import OmegaConf
-from torch.nn import functional as F
-from torch.optim import AdamW
-from torch.optim.lr_scheduler import LambdaLR
-from copy import deepcopy
-from einops import rearrange
-from glob import glob
-from natsort import natsorted
-from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
-from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
-__models__ = {
     # Maps a label key to the model class that load_classifier instantiates.
-    'class_label': EncoderUNetModel,
-    'segmentation': UNetModel
-}
-def disabled_train(self, mode=True):
-    """Overwrite model.train with this function to make sure train/eval mode
-    does not change anymore."""
     # `mode` is ignored; the first argument is returned unchanged.
-    return self
-class NoisyLatentImageClassifier(pl.LightningModule):
     # LightningModule that trains a classifier (self.model) on inputs noised by a
     # frozen diffusion model, which is instantiated from the newest
     # "*-project.yaml" config found under `diffusion_path`.
-    def __init__(self,
-                 diffusion_path,
-                 num_classes,
-                 ckpt_path=None,
-                 pool='attention',
-                 label_key=None,
-                 diffusion_ckpt_path=None,
-                 scheduler_config=None,
-                 weight_decay=1.e-2,
-                 log_steps=10,
-                 monitor='val/loss',
-                 *args,
-                 **kwargs):
-        super().__init__(*args, **kwargs)
-        self.num_classes = num_classes
-        # get latest config of diffusion model
-        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
-        self.diffusion_config = OmegaConf.load(diffusion_config).model
-        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
-        self.load_diffusion()
-        self.monitor = monitor
         # Number of 2x downsamplings get_conditioning applies to segmentation targets.
-        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
-        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
-        self.log_steps = log_steps
         # A cond_stage_key on the diffusion model takes precedence over `label_key`.
-        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
-            else self.diffusion_model.cond_stage_key
-        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
-        if self.label_key not in __models__:
-            raise NotImplementedError()
-        self.load_classifier(ckpt_path, pool)
-        self.scheduler_config = scheduler_config
-        self.use_scheduler = self.scheduler_config is not None
-        self.weight_decay = weight_decay
-    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         # Load weights from `path` non-strictly, into the whole module or only into
         # self.model when only_model is set. The file may be a bare state dict or
         # a dict that wraps one under "state_dict".
-        sd = torch.load(path, map_location="cpu")
-        if "state_dict" in list(sd.keys()):
-            sd = sd["state_dict"]
-        keys = list(sd.keys())
-        for k in keys:
-            for ik in ignore_keys:
-                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
                     # NOTE(review): a key matching two prefixes is deleted twice,
                     # which raises KeyError (no break after the delete).
-                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
-            sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
-        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-        if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
-    def load_diffusion(self):
         # Instantiate the diffusion model, put it in eval mode and freeze it.
-        model = instantiate_from_config(self.diffusion_config)
-        self.diffusion_model = model.eval()
         # disabled_train is stored on the instance, so it is not bound: the first
         # positional argument of a later .train(...) call lands in its `self` slot.
-        self.diffusion_model.train = disabled_train
-        for param in self.diffusion_model.parameters():
-            param.requires_grad = False
-    def load_classifier(self, ckpt_path, pool):
         # Build the classifier from a copy of the diffusion UNet config: its input
         # channels equal the UNet's output channels, its outputs equal num_classes.
-        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
-        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
-        model_config.out_channels = self.num_classes
-        if self.label_key == 'class_label':
-            model_config.pool = pool
-        self.model = __models__[self.label_key](**model_config)
-        if ckpt_path is not None:
-            print('#####################################################################')
-            print(f'load from ckpt "{ckpt_path}"')
-            print('#####################################################################')
-            self.init_from_ckpt(ckpt_path)
-    @torch.no_grad()
-    def get_x_noisy(self, x, t, noise=None):
         # Noise `x` at timestep `t` through the diffusion model's q_sample; fresh
         # Gaussian noise is drawn when `noise` is not supplied.
-        noise = default(noise, lambda: torch.randn_like(x))
-        continuous_sqrt_alpha_cumprod = None
-        if self.diffusion_model.use_continuous_noise:
-            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
-            # todo: make sure t+1 is correct here
-        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
-                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
-    def forward(self, x_noisy, t, *args, **kwargs):
         # Classifier logits for the noised input; extra arguments are ignored.
-        return self.model(x_noisy, t)
-    @torch.no_grad()
-    def get_input(self, batch, k):
         # Fetch batch[k], add a trailing channel axis when it is 3-D, and convert
         # from 'b h w c' to contiguous float 'b c h w'.
-        x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-        x = x.to(memory_format=torch.contiguous_format).float()
-        return x
-    @torch.no_grad()
-    def get_conditioning(self, batch, k=None):
         # Return the training targets batch[k] (k defaults to self.label_key) on
         # this module's device. Segmentation targets are additionally moved to
         # 'b c h w' and halved self.numd times with nearest-neighbour resizing.
-        if k is None:
-            k = self.label_key
-        assert k is not None, 'Needs to provide label key'
-        targets = batch[k].to(self.device)
-        if self.label_key == 'segmentation':
-            targets = rearrange(targets, 'b h w c -> b c h w')
-            for down in range(self.numd):
-                h, w = targets.shape[-2:]
-                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
-            # targets = rearrange(targets,'b c h w -> b h w c')
-        return targets
-    def compute_top_k(self, logits, labels, k, reduction="mean"):
         # Top-k hit indicator per sample: a float scalar for "mean", a tensor for
         # "none". NOTE(review): any other `reduction` returns None implicitly.
-        _, top_ks = torch.topk(logits, k, dim=1)
-        if reduction == "mean":
-            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
-        elif reduction == "none":
-            return (top_ks == labels[:, None]).float().sum(dim=-1)
-    def on_train_epoch_start(self):
-        # save some memory
-        self.diffusion_model.model.to('cpu')
-    @torch.no_grad()
-    def write_logs(self, loss, logits, targets):
         # Log mean loss, acc@1 and acc@5 under a "train"/"val" prefix chosen from
         # self.training, plus progress-bar entries for loss, global step and lr.
-        log_prefix = 'train' if self.training else 'val'
-        log = {}
-        log[f"{log_prefix}/loss"] = loss.mean()
-        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
-            logits, targets, k=1, reduction="mean"
-        )
-        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
-            logits, targets, k=5, reduction="mean"
-        )
-        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
-        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
-        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
-        lr = self.optimizers().param_groups[0]['lr']
-        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
-    def shared_step(self, batch, t=None):
         # Noise the inputs at timestep(s) `t` (random per sample when t is None,
         # otherwise the same value for every sample), classify them and return
         # (mean cross-entropy loss, logits, x_noisy, targets).
-        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
-        targets = self.get_conditioning(batch)
         # 4-D targets are reduced to class indices by an argmax over dim 1.
-        if targets.dim() == 4:
-            targets = targets.argmax(dim=1)
-        if t is None:
-            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
-        else:
-            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
-        x_noisy = self.get_x_noisy(x, t)
-        logits = self(x_noisy, t)
-        loss = F.cross_entropy(logits, targets, reduction='none')
-        self.write_logs(loss.detach(), logits.detach(), targets.detach())
-        loss = loss.mean()
-        return loss, logits, x_noisy, targets
-    def training_step(self, batch, batch_idx):
-        loss, *_ = self.shared_step(batch)
-        return loss
-    def reset_noise_accs(self):
         # One empty acc@1/acc@5 list per timestep, sampled every log_every_t steps.
-        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
-                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
-    def on_validation_start(self):
-        self.reset_noise_accs()
-    @torch.no_grad()
-    def validation_step(self, batch, batch_idx):
         # Returns the loss at random timesteps, then re-runs the batch at each
         # tracked timestep to collect per-noise-level accuracies.
-        loss, *_ = self.shared_step(batch)
-        for t in self.noisy_acc:
-            _, logits, _, targets = self.shared_step(batch, t)
-            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
-            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
-        return loss
-    def configure_optimizers(self):
         # AdamW over the classifier parameters only. Returns the bare optimizer,
         # or ([optimizer], [scheduler dict]) when a scheduler config was given.
         # self.learning_rate is not assigned in the visible code; presumably it is
         # set externally before training - confirm.
-        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
-        if self.use_scheduler:
-            scheduler = instantiate_from_config(self.scheduler_config)
-            print("Setting up LambdaLR scheduler...")
-            scheduler = [
-                {
-                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
-                    'interval': 'step',
-                    'frequency': 1
-                }]
-            return [optimizer], scheduler
-        return optimizer
-    @torch.no_grad()
-    def log_images(self, batch, N=8, *args, **kwargs):
         # Collect inputs, labels and (for map-like labels) noised inputs and
         # predictions at self.log_steps timesteps; every entry is cut to N items.
-        log = dict()
-        x = self.get_input(batch, self.diffusion_model.first_stage_key)
-        log['inputs'] = x
-        y = self.get_conditioning(batch)
-        if self.label_key == 'class_label':
             # Class labels are rendered as text images from batch["human_label"].
-            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-            log['labels'] = y
-        if ismap(y):
-            log['labels'] = self.diffusion_model.to_rgb(y)
-            for step in range(self.log_steps):
-                current_time = step * self.log_time_interval
-                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
-                log[f'inputs@t{current_time}'] = x_noisy
-                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
-                pred = rearrange(pred, 'b h w c -> b c h w')
-                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
-        for key in log:
-            log[key] = log[key][:N]
-        return log

ldm/models/diffusion/ddim.py CHANGED Viewed

@@ -3,7 +3,6 @@
 import torch
 import numpy as np
 from tqdm import tqdm
-from functools import partial
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
     extract_into_tensor
@@ -24,7 +23,7 @@ class DDIMSampler(object):
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
@@ -43,14 +42,14 @@ class DDIMSampler(object):
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
     @torch.no_grad()
@@ -75,6 +74,9 @@ class DDIMSampler(object):
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
                **kwargs
                ):
@@ -107,6 +109,9 @@ class DDIMSampler(object):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
                                                     )
         return samples, intermediates
@@ -116,7 +121,8 @@ class DDIMSampler(object):
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
@@ -131,7 +137,7 @@ class DDIMSampler(object):
             timesteps = self.ddim_timesteps[:subset_end]
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running DDIM Sampling with {total_steps} timesteps")
@@ -151,7 +157,13 @@ class DDIMSampler(object):
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning)
             img, pred_x0 = outs
             if callback: callback(i)
             if img_callback: img_callback(pred_x0, i)
@@ -165,20 +177,55 @@ class DDIMSampler(object):
     @torch.no_grad()
     def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None):
         b, *_, device = *x.shape, x.device
         if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-            e_t = self.model.apply_model(x, t, c)
         else:
             x_in = torch.cat([x] * 2)
             t_in = torch.cat([t] * 2)
-            c_in = torch.cat([unconditional_conditioning, c])
-            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
         if score_corrector is not None:
-            assert self.model.parameterization == "eps"
             e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
         alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
@@ -189,14 +236,18 @@ class DDIMSampler(object):
         a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
         a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
         sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
         # current prediction for x_0
-        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
         if quantize_denoised:
             pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
         # direction pointing to x_t
-        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
         noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
         if noise_dropout > 0.:
             noise = torch.nn.functional.dropout(noise, p=noise_dropout)
@@ -238,4 +289,4 @@ class DDIMSampler(object):
             x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                           unconditional_guidance_scale=unconditional_guidance_scale,
                                           unconditional_conditioning=unconditional_conditioning)
-        return x_dec

 import torch
 import numpy as np
 from tqdm import tqdm
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
     extract_into_tensor
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
+                                                                                   eta=ddim_eta, verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
     @torch.no_grad()
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
+               features_adapter=None,
+               append_to_context=None,
+               cond_tau=0.4,
                # unconditional_conditioning has to come in the same format as the conditioning, e.g. as encoded tokens, ...
                **kwargs
                ):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
+                                                    features_adapter=features_adapter,
+                                                    append_to_context=append_to_context,
+                                                    cond_tau=cond_tau,
                                                     )
         return samples, intermediates
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      append_to_context=None, cond_tau=0.4):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
             timesteps = self.ddim_timesteps[:subset_end]
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
+        time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running DDIM Sampling with {total_steps} timesteps")
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      features_adapter=None if index < int(
+                                          (1 - cond_tau) * total_steps) else features_adapter,
+                                      # TODO support style_cond_tau
+                                      append_to_context=None if index < int(
+                                          0.5 * total_steps) else append_to_context,
+                                      )
             img, pred_x0 = outs
             if callback: callback(i)
             if img_callback: img_callback(pred_x0, i)
     @torch.no_grad()
     def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      append_to_context=None):
         b, *_, device = *x.shape, x.device
         if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+            if append_to_context is not None:
+                model_output = self.model.apply_model(x, t, torch.cat([c, append_to_context], dim=1),
+                                                      features_adapter=features_adapter)
+            else:
+                model_output = self.model.apply_model(x, t, c, features_adapter=features_adapter)
         else:
             x_in = torch.cat([x] * 2)
             t_in = torch.cat([t] * 2)
+            if isinstance(c, dict):
+                assert isinstance(unconditional_conditioning, dict)
+                c_in = dict()
+                for k in c:
+                    if isinstance(c[k], list):
+                        c_in[k] = [torch.cat([
+                            unconditional_conditioning[k][i],
+                            c[k][i]]) for i in range(len(c[k]))]
+                    else:
+                        c_in[k] = torch.cat([
+                            unconditional_conditioning[k],
+                            c[k]])
+            elif isinstance(c, list):
+                c_in = list()
+                assert isinstance(unconditional_conditioning, list)
+                for i in range(len(c)):
+                    c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
+            else:
+                if append_to_context is not None:
+                    pad_len = append_to_context.size(1)
+                    new_unconditional_conditioning = torch.cat(
+                        [unconditional_conditioning, unconditional_conditioning[:, -pad_len:, :]], dim=1)
+                    new_c = torch.cat([c, append_to_context], dim=1)
+                    c_in = torch.cat([new_unconditional_conditioning, new_c])
+                else:
+                    c_in = torch.cat([unconditional_conditioning, c])
+            model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in, features_adapter=features_adapter).chunk(2)
+            model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
+        if self.model.parameterization == "v":
+            e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
+        else:
+            e_t = model_output
         if score_corrector is not None:
+            assert self.model.parameterization == "eps", 'not implemented'
             e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
         alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
         a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
         a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
         sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
         # current prediction for x_0
+        if self.model.parameterization != "v":
+            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        else:
+            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
         if quantize_denoised:
             pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
         # direction pointing to x_t
+        dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
         noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
         if noise_dropout > 0.:
             noise = torch.nn.functional.dropout(noise, p=noise_dropout)
             x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                           unconditional_guidance_scale=unconditional_guidance_scale,
                                           unconditional_conditioning=unconditional_conditioning)
+        return x_dec

ldm/models/diffusion/ddpm.py CHANGED Viewed

@@ -12,16 +12,18 @@ import numpy as np
 import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR
 from einops import rearrange, repeat
-from contextlib import contextmanager
 from functools import partial
 from tqdm import tqdm
 from torchvision.utils import make_grid
 from pytorch_lightning.utilities.distributed import rank_zero_only
 from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
 from ldm.modules.ema import LitEma
 from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
-from ldm.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
 from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
 from ldm.models.diffusion.ddim import DDIMSampler
@@ -71,9 +73,13 @@ class DDPM(pl.LightningModule):
                  use_positional_encodings=False,
                  learn_logvar=False,
                  logvar_init=0.,
                  ):
         super().__init__()
-        assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
         self.parameterization = parameterization
         print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
         self.cond_stage_model = None
@@ -100,8 +106,18 @@ class DDPM(pl.LightningModule):
         if monitor is not None:
             self.monitor = monitor
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
         self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
                                linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
@@ -113,6 +129,9 @@ class DDPM(pl.LightningModule):
         if self.learn_logvar:
             self.logvar = nn.Parameter(self.logvar, requires_grad=True)
     def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
@@ -146,7 +165,7 @@ class DDPM(pl.LightningModule):
         # calculations for posterior q(x_{t-1} | x_t, x_0)
         posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
-                    1. - alphas_cumprod) + self.v_posterior * betas
         # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
         self.register_buffer('posterior_variance', to_torch(posterior_variance))
         # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
@@ -158,12 +177,14 @@ class DDPM(pl.LightningModule):
         if self.parameterization == "eps":
             lvlb_weights = self.betas ** 2 / (
-                        2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
         elif self.parameterization == "x0":
             lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
         else:
             raise NotImplementedError("mu not supported")
-        # TODO how to choose this term
         lvlb_weights[0] = lvlb_weights[1]
         self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
         assert not torch.isnan(self.lvlb_weights).all()
@@ -183,6 +204,7 @@ class DDPM(pl.LightningModule):
                 if context is not None:
                     print(f"{context}: Restored training weights")
     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         sd = torch.load(path, map_location="cpu")
         if "state_dict" in list(sd.keys()):
@@ -193,13 +215,57 @@ class DDPM(pl.LightningModule):
                 if k.startswith(ik):
                     print("Deleting key {} from state_dict.".format(k))
                     del sd[k]
         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
             sd, strict=False)
         print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
         if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
         if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
     def q_mean_variance(self, x_start, t):
         """
@@ -219,6 +285,20 @@ class DDPM(pl.LightningModule):
                 extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
         )
     def q_posterior(self, x_start, x_t, t):
         posterior_mean = (
                 extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
@@ -276,6 +356,12 @@ class DDPM(pl.LightningModule):
         return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                 extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
     def get_loss(self, pred, target, mean=True):
         if self.loss_type == 'l1':
             loss = (target - pred).abs()
@@ -301,6 +387,8 @@ class DDPM(pl.LightningModule):
             target = noise
         elif self.parameterization == "x0":
             target = x_start
         else:
             raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")
@@ -328,10 +416,10 @@ class DDPM(pl.LightningModule):
     def get_input(self, batch, k):
         x = batch[k]
-        if len(x.shape) == 3:
-            x = x[..., None]
-        x = rearrange(x, 'b h w c -> b c h w')
-        x = x.to(memory_format=torch.contiguous_format).float()
         return x
     def shared_step(self, batch):
@@ -421,41 +509,12 @@ class DDPM(pl.LightningModule):
         return opt
-class DiffusionWrapper(pl.LightningModule):
-    def __init__(self, diff_model_config, conditioning_key):
-        super().__init__()
-        self.diffusion_model = instantiate_from_config(diff_model_config)
-        self.conditioning_key = conditioning_key
-        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']
-    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, features_adapter=None):
-        if self.conditioning_key is None:
-            out = self.diffusion_model(x, t, features_adapter=features_adapter)
-        elif self.conditioning_key == 'concat':
-            xc = torch.cat([x] + c_concat, dim=1)
-            out = self.diffusion_model(xc, t, features_adapter=features_adapter)
-        elif self.conditioning_key == 'crossattn':
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(x, t, context=cc, features_adapter=features_adapter)
-        elif self.conditioning_key == 'hybrid':
-            xc = torch.cat([x] + c_concat, dim=1)
-            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(xc, t, context=cc, features_adapter=features_adapter)
-        elif self.conditioning_key == 'adm':
-            cc = c_crossattn[0]
-            out = self.diffusion_model(x, t, y=cc, features_adapter=features_adapter)
-        else:
-            raise NotImplementedError()
-        return out
 class LatentDiffusion(DDPM):
     """main class"""
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
-                 unet_config,
                  num_timesteps_cond=None,
                  cond_stage_key="image",
                  cond_stage_trainable=False,
@@ -474,9 +533,10 @@ class LatentDiffusion(DDPM):
         if cond_stage_config == '__is_unconditional__':
             conditioning_key = None
         ckpt_path = kwargs.pop("ckpt_path", None)
         ignore_keys = kwargs.pop("ignore_keys", [])
-        super().__init__(conditioning_key=conditioning_key, unet_config=unet_config, *args, **kwargs)
-        self.model = DiffusionWrapper(unet_config, conditioning_key)
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
         self.cond_stage_key = cond_stage_key
@@ -492,35 +552,27 @@ class LatentDiffusion(DDPM):
         self.instantiate_cond_stage(cond_stage_config)
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
-        self.bbox_tokenizer = None
         self.restarted_from_ckpt = False
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys)
             self.restarted_from_ckpt = True
     def make_cond_schedule(self, ):
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
         self.cond_ids[:self.num_timesteps_cond] = ids
-    @rank_zero_only
-    @torch.no_grad()
-    def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
-        # only for very first batch
-        if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0 and not self.restarted_from_ckpt:
-            assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
-            # set rescale weight to 1./std of encodings
-            print("### USING STD-RESCALING ###")
-            x = super().get_input(batch, self.first_stage_key)
-            x = x.to(self.device)
-            encoder_posterior = self.encode_first_stage(x)
-            z = self.get_first_stage_encoding(encoder_posterior).detach()
-            del self.scale_factor
-            self.register_buffer('scale_factor', 1. / z.flatten().std())
-            print(f"setting self.scale_factor to {self.scale_factor}")
-            print("### USING STD-RESCALING ###")
     def register_schedule(self,
                           given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
@@ -562,7 +614,7 @@ class LatentDiffusion(DDPM):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):
             denoise_row.append(self.decode_first_stage(zd.to(self.device),
-                                                            force_not_quantize=force_no_decoder_quantization))
         n_imgs_per_row = len(denoise_row)
         denoise_row = torch.stack(denoise_row)  # n_log_step, n_row, C, H, W
         denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
@@ -695,9 +747,9 @@ class LatentDiffusion(DDPM):
             if cond_key is None:
                 cond_key = self.cond_stage_key
             if cond_key != self.first_stage_key:
-                if cond_key in ['caption', 'coordinates_bbox']:
                     xc = batch[cond_key]
-                elif cond_key == 'class_label':
                     xc = batch
                 else:
                     xc = super().get_input(batch, cond_key).to(self.device)
@@ -742,181 +794,28 @@ class LatentDiffusion(DDPM):
             z = rearrange(z, 'b h w c -> b c h w').contiguous()
         z = 1. / self.scale_factor * z
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
-    # same as above but without decorator
-    def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
-        if predict_cids:
-            if z.dim() == 4:
-                z = torch.argmax(z.exp(), dim=1).long()
-            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
-            z = rearrange(z, 'b h w c -> b c h w').contiguous()
-        z = 1. / self.scale_factor * z
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                uf = self.split_input_params["vqf"]
-                bs, nc, h, w = z.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-                fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)
-                z = unfold(z)  # (bn, nc * prod(**ks), L)
-                # 1. Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-                # 2. apply model loop over last dim
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i],
-                                                                 force_not_quantize=predict_cids or force_not_quantize)
-                                   for i in range(z.shape[-1])]
-                else:
-                    output_list = [self.first_stage_model.decode(z[:, :, :, :, i])
-                                   for i in range(z.shape[-1])]
-                o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
-                o = o * weighting
-                # Reverse 1. reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization  # norm is shape (1, 1, h, w)
-                return decoded
-            else:
-                if isinstance(self.first_stage_model, VQModelInterface):
-                    return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-                else:
-                    return self.first_stage_model.decode(z)
-        else:
-            if isinstance(self.first_stage_model, VQModelInterface):
-                return self.first_stage_model.decode(z, force_not_quantize=predict_cids or force_not_quantize)
-            else:
-                return self.first_stage_model.decode(z)
     @torch.no_grad()
     def encode_first_stage(self, x):
-        if hasattr(self, "split_input_params"):
-            if self.split_input_params["patch_distributed_vq"]:
-                ks = self.split_input_params["ks"]  # eg. (128, 128)
-                stride = self.split_input_params["stride"]  # eg. (64, 64)
-                df = self.split_input_params["vqf"]
-                self.split_input_params['original_image_size'] = x.shape[-2:]
-                bs, nc, h, w = x.shape
-                if ks[0] > h or ks[1] > w:
-                    ks = (min(ks[0], h), min(ks[1], w))
-                    print("reducing Kernel")
-                if stride[0] > h or stride[1] > w:
-                    stride = (min(stride[0], h), min(stride[1], w))
-                    print("reducing stride")
-                fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
-                z = unfold(x)  # (bn, nc * prod(**ks), L)
-                # Reshape to img shape
-                z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-                output_list = [self.first_stage_model.encode(z[:, :, :, :, i])
-                               for i in range(z.shape[-1])]
-                o = torch.stack(output_list, axis=-1)
-                o = o * weighting
-                # Reverse reshape to img shape
-                o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-                # stitch crops together
-                decoded = fold(o)
-                decoded = decoded / normalization
-                return decoded
-            else:
-                return self.first_stage_model.encode(x)
-        else:
-            return self.first_stage_model.encode(x)
     def shared_step(self, batch, **kwargs):
         x, c = self.get_input(batch, self.first_stage_key)
-        loss = self(x, c)
         return loss
-    def forward(self, x, c, features_adapter=None, *args, **kwargs):
-        t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
-        return self.p_losses(x, c, t, features_adapter, *args, **kwargs)
-    def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset
-        def rescale_bbox(bbox):
-            x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
-            y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
-            w = min(bbox[2] / crop_coordinates[2], 1 - x0)
-            h = min(bbox[3] / crop_coordinates[3], 1 - y0)
-            return x0, y0, w, h
-        return [rescale_bbox(b) for b in bboxes]
-    def apply_model(self, x_noisy, t, cond, features_adapter=None, return_ids=False):
         if isinstance(cond, dict):
-            # hybrid case, cond is expected to be a dict
             pass
         else:
             if not isinstance(cond, list):
@@ -924,98 +823,7 @@ class LatentDiffusion(DDPM):
             key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
             cond = {key: cond}
-        if hasattr(self, "split_input_params"):
-            assert len(cond) == 1  # todo can only deal with one conditioning atm
-            assert not return_ids
-            ks = self.split_input_params["ks"]  # eg. (128, 128)
-            stride = self.split_input_params["stride"]  # eg. (64, 64)
-            h, w = x_noisy.shape[-2:]
-            fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)
-            z = unfold(x_noisy)  # (bn, nc * prod(**ks), L)
-            # Reshape to img shape
-            z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-            z_list = [z[:, :, :, :, i] for i in range(z.shape[-1])]
-            if self.cond_stage_key in ["image", "LR_image", "segmentation",
-                                       'bbox_img'] and self.model.conditioning_key:  # todo check for completeness
-                c_key = next(iter(cond.keys()))  # get key
-                c = next(iter(cond.values()))  # get value
-                assert (len(c) == 1)  # todo extend to list with more than one elem
-                c = c[0]  # get element
-                c = unfold(c)
-                c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
-                cond_list = [{c_key: [c[:, :, :, :, i]]} for i in range(c.shape[-1])]
-            elif self.cond_stage_key == 'coordinates_bbox':
-                assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'
-                # assuming padding of unfold is always 0 and its dilation is always 1
-                n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
-                full_img_h, full_img_w = self.split_input_params['original_image_size']
-                # as we are operating on latents, we need the factor from the original image size to the
-                # spatial latent size to properly rescale the crops for regenerating the bbox annotations
-                num_downs = self.first_stage_model.encoder.num_resolutions - 1
-                rescale_latent = 2 ** (num_downs)
-                # get top left postions of patches as conforming for the bbbox tokenizer, therefore we
-                # need to rescale the tl patch coordinates to be in between (0,1)
-                tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
-                                         rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
-                                        for patch_nr in range(z.shape[-1])]
-                # patch_limits are tl_coord, width and height coordinates as (x_tl, y_tl, h, w)
-                patch_limits = [(x_tl, y_tl,
-                                 rescale_latent * ks[0] / full_img_w,
-                                 rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]
-                # patch_values = [(np.arange(x_tl,min(x_tl+ks, 1.)),np.arange(y_tl,min(y_tl+ks, 1.))) for x_tl, y_tl in tl_patch_coordinates]
-                # tokenize crop coordinates for the bounding boxes of the respective patches
-                patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
-                                      for bbox in patch_limits]  # list of length l with tensors of shape (1, 2)
-                print(patch_limits_tknzd[0].shape)
-                # cut tknzd crop position from conditioning
-                assert isinstance(cond, dict), 'cond must be dict to be fed into model'
-                cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)
-                print(cut_cond.shape)
-                adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
-                adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
-                print(adapted_cond.shape)
-                adapted_cond = self.get_learned_conditioning(adapted_cond)
-                print(adapted_cond.shape)
-                adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])
-                print(adapted_cond.shape)
-                cond_list = [{'c_crossattn': [e]} for e in adapted_cond]
-            else:
-                cond_list = [cond for i in range(z.shape[-1])]  # Todo make this more efficient
-            # apply model by loop over crops
-            if features_adapter is not None:
-                output_list = [self.model(z_list[i], t, **cond_list[i], features_adapter=features_adapter) for i in range(z.shape[-1])]
-            else:
-                output_list = [self.model(z_list[i], t, **cond_list[i]) for i in range(z.shape[-1])]
-            assert not isinstance(output_list[0],
-                                  tuple)  # todo cant deal with multiple model outputs check this never happens
-            o = torch.stack(output_list, axis=-1)
-            o = o * weighting
-            # Reverse reshape to img shape
-            o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
-            # stitch crops together
-            x_recon = fold(o) / normalization
-        else:
-            if features_adapter is not None:
-                x_recon = self.model(x_noisy, t, **cond, features_adapter=features_adapter)
-            else:
-                x_recon = self.model(x_noisy, t, **cond)
         if isinstance(x_recon, tuple) and not return_ids:
             return x_recon[0]
@@ -1040,10 +848,10 @@ class LatentDiffusion(DDPM):
         kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
         return mean_flat(kl_prior) / np.log(2.0)
-    def p_losses(self, x_start, cond, t, features_adapter=None, noise=None):
         noise = default(noise, lambda: torch.randn_like(x_start))
         x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-        model_output = self.apply_model(x_noisy, t, cond, features_adapter)
         loss_dict = {}
         prefix = 'train' if self.training else 'val'
@@ -1052,6 +860,8 @@ class LatentDiffusion(DDPM):
             target = x_start
         elif self.parameterization == "eps":
             target = noise
         else:
             raise NotImplementedError()
@@ -1247,7 +1057,7 @@ class LatentDiffusion(DDPM):
     @torch.no_grad()
     def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
                verbose=True, timesteps=None, quantize_denoised=False,
-               mask=None, x0=None, shape=None,**kwargs):
         if shape is None:
             shape = (batch_size, self.channels, self.image_size, self.image_size)
         if cond is not None:
@@ -1263,26 +1073,51 @@ class LatentDiffusion(DDPM):
                                   mask=mask, x0=x0)
     @torch.no_grad()
-    def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
         if ddim:
             ddim_sampler = DDIMSampler(self)
             shape = (self.channels, self.image_size, self.image_size)
-            samples, intermediates =ddim_sampler.sample(ddim_steps,batch_size,
-                                                        shape,cond,verbose=False,**kwargs)
         else:
             samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
-                                                 return_intermediates=True,**kwargs)
         return samples, intermediates
     @torch.no_grad()
-    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                    quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, **kwargs):
         use_ddim = ddim_steps is not None
         log = dict()
@@ -1299,12 +1134,16 @@ class LatentDiffusion(DDPM):
             if hasattr(self.cond_stage_model, "decode"):
                 xc = self.cond_stage_model.decode(c)
                 log["conditioning"] = xc
-            elif self.cond_stage_key in ["caption"]:
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["caption"])
                 log["conditioning"] = xc
-            elif self.cond_stage_key == 'class_label':
-                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
-                log['conditioning'] = xc
             elif isimage(xc):
                 log["conditioning"] = xc
             if ismap(xc):
@@ -1330,9 +1169,9 @@ class LatentDiffusion(DDPM):
         if sample:
             # get denoise row
-            with self.ema_scope("Plotting"):
-                samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                         ddim_steps=ddim_steps,eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
@@ -1343,39 +1182,52 @@ class LatentDiffusion(DDPM):
             if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                     self.first_stage_model, IdentityFirstStage):
                 # also display when quantizing x0 while sampling
-                with self.ema_scope("Plotting Quantized Denoised"):
-                    samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
-                                                             ddim_steps=ddim_steps,eta=ddim_eta,
                                                              quantize_denoised=True)
                     # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
                     #                                      quantize_denoised=True)
                 x_samples = self.decode_first_stage(samples.to(self.device))
                 log["samples_x0_quantized"] = x_samples
-            if inpaint:
-                # make a simple center square
-                b, h, w = z.shape[0], z.shape[2], z.shape[3]
-                mask = torch.ones(N, h, w).to(self.device)
-                # zeros will be filled in
-                mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
-                mask = mask[:, None, ...]
-                with self.ema_scope("Plotting Inpaint"):
-                    samples, _ = self.sample_log(cond=c,batch_size=N,ddim=use_ddim, eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_inpainting"] = x_samples
-                log["mask"] = mask
-                # outpaint
-                with self.ema_scope("Plotting Outpaint"):
-                    samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,eta=ddim_eta,
-                                                ddim_steps=ddim_steps, x0=z[:N], mask=mask)
-                x_samples = self.decode_first_stage(samples.to(self.device))
-                log["samples_outpainting"] = x_samples
         if plot_progressive_rows:
-            with self.ema_scope("Plotting Progressives"):
                 img, progressives = self.progressive_denoising(c,
                                                                shape=(self.channels, self.image_size, self.image_size),
                                                                batch_size=N)
@@ -1422,25 +1274,40 @@ class LatentDiffusion(DDPM):
         x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
         return x
-class Layout2ImgDiffusion(LatentDiffusion):
-    # TODO: move all layout-specific hacks to this class
-    def __init__(self, cond_stage_key, *args, **kwargs):
-        assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
-        super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)
-    def log_images(self, batch, N=8, *args, **kwargs):
-        logs = super().log_images(batch=batch, N=N, *args, **kwargs)
-        key = 'train' if self.training else 'validation'
-        dset = self.trainer.datamodule.datasets[key]
-        mapper = dset.conditional_builders[self.cond_stage_key]
-        bbox_imgs = []
-        map_fn = lambda catno: dset.get_textual_label(dset.get_category_id(catno))
-        for tknzd_bbox in batch[self.cond_stage_key][:N]:
-            bboximg = mapper.plot(tknzd_bbox.detach().cpu(), map_fn, (256, 256))
-            bbox_imgs.append(bboximg)
-        cond_img = torch.stack(bbox_imgs, dim=0)
-        logs['bbox_image'] = cond_img
-        return logs

 import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR
 from einops import rearrange, repeat
+from contextlib import contextmanager, nullcontext
 from functools import partial
+import itertools
 from tqdm import tqdm
 from torchvision.utils import make_grid
 from pytorch_lightning.utilities.distributed import rank_zero_only
+from omegaconf import ListConfig
 from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
 from ldm.modules.ema import LitEma
 from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
+from ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL
 from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
 from ldm.models.diffusion.ddim import DDIMSampler
                  use_positional_encodings=False,
                  learn_logvar=False,
                  logvar_init=0.,
+                 make_it_fit=False,
+                 ucg_training=None,
+                 reset_ema=False,
+                 reset_num_ema_updates=False,
                  ):
         super().__init__()
+        assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"'
         self.parameterization = parameterization
         print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
         self.cond_stage_model = None
         if monitor is not None:
             self.monitor = monitor
+        self.make_it_fit = make_it_fit
+        if reset_ema: assert exists(ckpt_path)
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+            if reset_ema:
+                assert self.use_ema
+                print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
         self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
                                linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
         if self.learn_logvar:
             self.logvar = nn.Parameter(self.logvar, requires_grad=True)
+        self.ucg_training = ucg_training or dict()
+        if self.ucg_training:
+            self.ucg_prng = np.random.RandomState()
     def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
         # calculations for posterior q(x_{t-1} | x_t, x_0)
         posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
+                1. - alphas_cumprod) + self.v_posterior * betas
         # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
         self.register_buffer('posterior_variance', to_torch(posterior_variance))
         # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
         if self.parameterization == "eps":
             lvlb_weights = self.betas ** 2 / (
+                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
         elif self.parameterization == "x0":
             lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
+        elif self.parameterization == "v":
+            lvlb_weights = torch.ones_like(self.betas ** 2 / (
+                    2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)))
         else:
             raise NotImplementedError("mu not supported")
         lvlb_weights[0] = lvlb_weights[1]
         self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
         assert not torch.isnan(self.lvlb_weights).all()
                 if context is not None:
                     print(f"{context}: Restored training weights")
+    @torch.no_grad()
     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
         sd = torch.load(path, map_location="cpu")
         if "state_dict" in list(sd.keys()):
                 if k.startswith(ik):
                     print("Deleting key {} from state_dict.".format(k))
                     del sd[k]
+        if self.make_it_fit:
+            n_params = len([name for name, _ in
+                            itertools.chain(self.named_parameters(),
+                                            self.named_buffers())])
+            for name, param in tqdm(
+                    itertools.chain(self.named_parameters(),
+                                    self.named_buffers()),
+                    desc="Fitting old weights to new weights",
+                    total=n_params
+            ):
+                if not name in sd:
+                    continue
+                old_shape = sd[name].shape
+                new_shape = param.shape
+                assert len(old_shape) == len(new_shape)
+                if len(new_shape) > 2:
+                    # we only modify first two axes
+                    assert new_shape[2:] == old_shape[2:]
+                # assumes first axis corresponds to output dim
+                if not new_shape == old_shape:
+                    new_param = param.clone()
+                    old_param = sd[name]
+                    if len(new_shape) == 1:
+                        for i in range(new_param.shape[0]):
+                            new_param[i] = old_param[i % old_shape[0]]
+                    elif len(new_shape) >= 2:
+                        for i in range(new_param.shape[0]):
+                            for j in range(new_param.shape[1]):
+                                new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]]
+                        n_used_old = torch.ones(old_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_old[j % old_shape[1]] += 1
+                        n_used_new = torch.zeros(new_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_new[j] = n_used_old[j % old_shape[1]]
+                        n_used_new = n_used_new[None, :]
+                        while len(n_used_new.shape) < len(new_shape):
+                            n_used_new = n_used_new.unsqueeze(-1)
+                        new_param /= n_used_new
+                    sd[name] = new_param
         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
             sd, strict=False)
         print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
         if len(missing) > 0:
+            print(f"Missing Keys:\n {missing}")
         if len(unexpected) > 0:
+            print(f"\nUnexpected Keys:\n {unexpected}")
     def q_mean_variance(self, x_start, t):
         """
                 extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
         )
+    def predict_start_from_z_and_v(self, x_t, t, v):
         # Convert a v-prediction back into an estimate of the clean sample:
         #   sqrt_alphas_cumprod[t] * x_t - sqrt_one_minus_alphas_cumprod[t] * v
         # With x_t built as in q_sample and v built as in get_v, this
         # expression reduces algebraically to x_start.
         # extract_into_tensor is handed x_t.shape, presumably so that the
         # per-timestep coefficient broadcasts over x_t -- TODO confirm.
         # NOTE(review): the two commented-out register_buffer lines below look
         # like a reminder of how the coefficient buffers are defined; their
         # actual registration is not visible in this method.
+        # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+        # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
+        )
+    def predict_eps_from_z_and_v(self, x_t, t, v):
         # Convert a v-prediction back into an estimate of the noise:
         #   sqrt_alphas_cumprod[t] * v + sqrt_one_minus_alphas_cumprod[t] * x_t
         # With x_t built as in q_sample and v built as in get_v, this
         # expression reduces algebraically to the noise term.
         # extract_into_tensor is handed x_t.shape, presumably so that the
         # per-timestep coefficient broadcasts over x_t -- TODO confirm.
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * x_t
+        )
     def q_posterior(self, x_start, x_t, t):
         posterior_mean = (
                 extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
         return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                 extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+    def get_v(self, x, noise, t):
         # Build the "v" quantity from a clean sample x and its noise:
         #   sqrt_alphas_cumprod[t] * noise - sqrt_one_minus_alphas_cumprod[t] * x
         # The loss code uses this as the regression target when
         # self.parameterization == "v".
         # extract_into_tensor is handed x.shape, presumably so that the
         # per-timestep coefficient broadcasts over x -- TODO confirm.
+        return (
+                extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise -
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x
+        )
     def get_loss(self, pred, target, mean=True):
         if self.loss_type == 'l1':
             loss = (target - pred).abs()
             target = noise
         elif self.parameterization == "x0":
             target = x_start
+        elif self.parameterization == "v":
+            target = self.get_v(x_start, noise, t)
         else:
             raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")
     def get_input(self, batch, k):
         # Return the entry stored under key k of the batch, unchanged.
         x = batch[k]
         # The commented-out steps below (append a trailing axis to 3-D input,
         # rearrange 'b h w c -> b c h w', convert to contiguous float) are
         # disabled, so no layout or dtype conversion happens in this method.
+        # if len(x.shape) == 3:
+        #     x = x[..., None]
+        # x = rearrange(x, 'b h w c -> b c h w')
+        # x = x.to(memory_format=torch.contiguous_format).float()
         return x
     def shared_step(self, batch):
         return opt
 class LatentDiffusion(DDPM):
     """main class"""
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
                  num_timesteps_cond=None,
                  cond_stage_key="image",
                  cond_stage_trainable=False,
         if cond_stage_config == '__is_unconditional__':
             conditioning_key = None
         ckpt_path = kwargs.pop("ckpt_path", None)
+        reset_ema = kwargs.pop("reset_ema", False)
+        reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False)
         ignore_keys = kwargs.pop("ignore_keys", [])
+        super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
         self.cond_stage_key = cond_stage_key
         self.instantiate_cond_stage(cond_stage_config)
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
+        self.bbox_tokenizer = None
         self.restarted_from_ckpt = False
         if ckpt_path is not None:
             self.init_from_ckpt(ckpt_path, ignore_keys)
             self.restarted_from_ckpt = True
+            if reset_ema:
+                assert self.use_ema
+                print(
+                    f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
     def make_cond_schedule(self, ):
         # Build self.cond_ids, a long tensor with one entry per timestep.
         # Every entry starts out as the last timestep index (num_timesteps - 1).
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         # num_timesteps_cond evenly spaced indices from 0 to num_timesteps - 1,
         # rounded to integers.
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
         # Only the first num_timesteps_cond slots are overwritten; the rest
         # keep the fill value.
         self.cond_ids[:self.num_timesteps_cond] = ids
     def register_schedule(self,
                           given_betas=None, beta_schedule="linear", timesteps=1000,
                           linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):
             denoise_row.append(self.decode_first_stage(zd.to(self.device),
+                                                       force_not_quantize=force_no_decoder_quantization))
         n_imgs_per_row = len(denoise_row)
         denoise_row = torch.stack(denoise_row)  # n_log_step, n_row, C, H, W
         denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
             if cond_key is None:
                 cond_key = self.cond_stage_key
             if cond_key != self.first_stage_key:
+                if cond_key in ['caption', 'coordinates_bbox', "txt"]:
                     xc = batch[cond_key]
+                elif cond_key in ['class_label', 'cls']:
                     xc = batch
                 else:
                     xc = super().get_input(batch, cond_key).to(self.device)
             z = rearrange(z, 'b h w c -> b c h w').contiguous()
         z = 1. / self.scale_factor * z
+        return self.first_stage_model.decode(z)
     @torch.no_grad()
     def encode_first_stage(self, x):
         # Encode x with the first-stage model; runs with gradients disabled.
         # The encoder's output is returned as-is, with no post-processing here.
+        return self.first_stage_model.encode(x)
     def shared_step(self, batch, **kwargs):
         # Fetch the input x and the conditioning c for this batch, run the
         # model on them, and return the result (named loss here).
         x, c = self.get_input(batch, self.first_stage_key)
         # Calling self(...) goes through the module's __call__ (presumably
         # dispatching to forward -- confirm); extra keyword arguments are
         # passed along untouched.
+        loss = self(x, c, **kwargs)
         return loss
+    def forward(self, x, c, *args, **kwargs):
         # Pick the timesteps for this batch and delegate to p_losses.
         # Without a 't' keyword, one integer timestep per batch element
         # (x.shape[0] of them) is drawn from [0, self.num_timesteps) on
         # self.device.
+        if 't' not in kwargs:
+            t = torch.randint(0, self.num_timesteps, (x.shape[0], ), device=self.device).long()
+        else:
             # The caller supplied the timesteps; pop them so 't' is not also
             # forwarded inside **kwargs.
+            t = kwargs.pop('t')
+        return self.p_losses(x, c, t, *args, **kwargs)
+    def apply_model(self, x_noisy, t, cond, return_ids=False, **kwargs):
         if isinstance(cond, dict):
+            # hybrid case, cond is expected to be a dict
             pass
         else:
             if not isinstance(cond, list):
             key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
             cond = {key: cond}
+        x_recon = self.model(x_noisy, t, **cond, **kwargs)
         if isinstance(x_recon, tuple) and not return_ids:
             return x_recon[0]
         kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
         return mean_flat(kl_prior) / np.log(2.0)
+    def p_losses(self, x_start, cond, t, noise=None, **kwargs):
         noise = default(noise, lambda: torch.randn_like(x_start))
         x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+        model_output = self.apply_model(x_noisy, t, cond, **kwargs)
         loss_dict = {}
         prefix = 'train' if self.training else 'val'
             target = x_start
         elif self.parameterization == "eps":
             target = noise
+        elif self.parameterization == "v":
+            target = self.get_v(x_start, noise, t)
         else:
             raise NotImplementedError()
     @torch.no_grad()
     def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
                verbose=True, timesteps=None, quantize_denoised=False,
+               mask=None, x0=None, shape=None, **kwargs):
         if shape is None:
             shape = (batch_size, self.channels, self.image_size, self.image_size)
         if cond is not None:
                                   mask=mask, x0=x0)
     @torch.no_grad()
+    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
         # Draw samples with either the DDIM sampler or self.sample and return
         # (samples, intermediates). Runs with gradients disabled.
         if ddim:
             ddim_sampler = DDIMSampler(self)
             # The shape passed to the DDIM sampler has no batch axis;
             # batch_size is given as a separate argument.
             shape = (self.channels, self.image_size, self.image_size)
+            samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size,
+                                                         shape, cond, verbose=False, **kwargs)
         else:
             # ddim_steps is not used on this path; intermediates are always
             # requested from self.sample.
             samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
+                                                 return_intermediates=True, **kwargs)
         return samples, intermediates
+    @torch.no_grad()
+    def get_unconditional_conditioning(self, batch_size, null_label=None):
         # Build the "unconditional" conditioning for batch_size samples.
         # With a null_label: encode it via get_learned_conditioning, then tile
         # the result to batch_size.
         # Without one: only the "class_label"/"cls" conditioning keys are
         # handled; any other key raises NotImplementedError.
+        if null_label is not None:
+            xc = null_label
             # An OmegaConf ListConfig is turned into a plain list first.
+            if isinstance(xc, ListConfig):
+                xc = list(xc)
+            if isinstance(xc, dict) or isinstance(xc, list):
+                c = self.get_learned_conditioning(xc)
+            else:
                 # Anything exposing .to(...) is moved to the model device
                 # before encoding.
+                if hasattr(xc, "to"):
+                    xc = xc.to(self.device)
+                c = self.get_learned_conditioning(xc)
+        else:
+            if self.cond_stage_key in ["class_label", "cls"]:
                 # The conditioning model supplies its own null input for
                 # batch_size items; the encoded result is returned directly,
                 # skipping the repeat step at the end of this method.
+                xc = self.cond_stage_model.get_unconditional_conditioning(batch_size, device=self.device)
+                return self.get_learned_conditioning(xc)
+            else:
+                raise NotImplementedError("todo")
         # Tile to the batch: the einops pattern '1 ... -> b ...' expects each
         # encoded tensor to have a leading axis of length 1.
+        if isinstance(c, list):  # in case the encoder gives us a list
+            for i in range(len(c)):
+                c[i] = repeat(c[i], '1 ... -> b ...', b=batch_size).to(self.device)
+        else:
+            c = repeat(c, '1 ... -> b ...', b=batch_size).to(self.device)
+        return c
     @torch.no_grad()
+    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None,
                    quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
+                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
+                   use_ema_scope=True,
+                   **kwargs):
+        ema_scope = self.ema_scope if use_ema_scope else nullcontext
         use_ddim = ddim_steps is not None
         log = dict()
             if hasattr(self.cond_stage_model, "decode"):
                 xc = self.cond_stage_model.decode(c)
                 log["conditioning"] = xc
+            elif self.cond_stage_key in ["caption", "txt"]:
+                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
                 log["conditioning"] = xc
+            elif self.cond_stage_key in ['class_label', "cls"]:
+                try:
+                    xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
+                    log['conditioning'] = xc
+                except KeyError:
+                    # probably no "human_label" in batch
+                    pass
             elif isimage(xc):
                 log["conditioning"] = xc
             if ismap(xc):
         if sample:
             # get denoise row
+            with ema_scope("Sampling"):
+                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                         ddim_steps=ddim_steps, eta=ddim_eta)
                 # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
             x_samples = self.decode_first_stage(samples)
             log["samples"] = x_samples
             if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                     self.first_stage_model, IdentityFirstStage):
                 # also display when quantizing x0 while sampling
+                with ema_scope("Plotting Quantized Denoised"):
+                    samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                             ddim_steps=ddim_steps, eta=ddim_eta,
                                                              quantize_denoised=True)
                     # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
                     #                                      quantize_denoised=True)
                 x_samples = self.decode_first_stage(samples.to(self.device))
                 log["samples_x0_quantized"] = x_samples
+        if unconditional_guidance_scale > 1.0:
+            uc = self.get_unconditional_conditioning(N, unconditional_guidance_label)
+            if self.model.conditioning_key == "crossattn-adm":
+                uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
+            with ema_scope("Sampling with classifier-free guidance"):
+                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
+                                                 ddim_steps=ddim_steps, eta=ddim_eta,
+                                                 unconditional_guidance_scale=unconditional_guidance_scale,
+                                                 unconditional_conditioning=uc,
+                                                 )
+                x_samples_cfg = self.decode_first_stage(samples_cfg)
+                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
+        if inpaint:
+            # make a simple center square
+            b, h, w = z.shape[0], z.shape[2], z.shape[3]
+            mask = torch.ones(N, h, w).to(self.device)
+            # zeros will be filled in
+            mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
+            mask = mask[:, None, ...]
+            with ema_scope("Plotting Inpaint"):
+                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
+                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+            x_samples = self.decode_first_stage(samples.to(self.device))
+            log["samples_inpainting"] = x_samples
+            log["mask"] = mask
+            # outpaint
+            mask = 1. - mask
+            with ema_scope("Plotting Outpaint"):
+                samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
+                                             ddim_steps=ddim_steps, x0=z[:N], mask=mask)
+            x_samples = self.decode_first_stage(samples.to(self.device))
+            log["samples_outpainting"] = x_samples
         if plot_progressive_rows:
+            with ema_scope("Plotting Progressives"):
                 img, progressives = self.progressive_denoising(c,
                                                                shape=(self.channels, self.image_size, self.image_size),
                                                                batch_size=N)
         x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
         return x
+class DiffusionWrapper(pl.LightningModule):  # wraps the configured diffusion model and routes conditioning inputs to it
+    def __init__(self, diff_model_config, conditioning_key):
+        super().__init__()
+        self.diffusion_model = instantiate_from_config(diff_model_config)  # underlying model, built from its config
+        self.conditioning_key = conditioning_key  # selects which branch forward() takes
+        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']  # reject unknown modes at construction time
+    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None, **kwargs):  # dispatch on conditioning_key; extra kwargs are forwarded to the model unchanged
+        if self.conditioning_key is None:
+            out = self.diffusion_model(x, t, **kwargs)  # unconditional: no conditioning inputs are used
+        elif self.conditioning_key == 'concat':
+            xc = torch.cat([x] + c_concat, dim=1)  # c_concat must be a list; its tensors are appended to x along dim 1
+            out = self.diffusion_model(xc, t, **kwargs)
+        elif self.conditioning_key == 'crossattn':
+            cc = torch.cat(c_crossattn, 1)  # join all cross-attention conditions along dim 1
+            out = self.diffusion_model(x, t, context=cc, **kwargs)
+        elif self.conditioning_key == 'hybrid':
+            xc = torch.cat([x] + c_concat, dim=1)  # concat-style conditions go into the model input...
+            cc = torch.cat(c_crossattn, 1)  # ...and cross-attention conditions go in as `context`
+            out = self.diffusion_model(xc, t, context=cc, **kwargs)
+        elif self.conditioning_key == 'hybrid-adm':
+            assert c_adm is not None  # this mode additionally requires c_adm, passed to the model as `y`
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, y=c_adm, **kwargs)
+        elif self.conditioning_key == 'crossattn-adm':
+            assert c_adm is not None  # this mode additionally requires c_adm, passed to the model as `y`
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(x, t, context=cc, y=c_adm, **kwargs)
+        elif self.conditioning_key == 'adm':
+            cc = c_crossattn[0]  # NOTE: 'adm' takes its `y` input from the first c_crossattn entry, not from c_adm
+            out = self.diffusion_model(x, t, y=cc, **kwargs)
+        else:
+            raise NotImplementedError()  # only reachable if the __init__ assert was skipped (e.g. python -O) or the key was changed afterwards
+        return out

ldm/models/diffusion/dpm_solver/dpm_solver.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import torch
 import torch.nn.functional as F
 import math
 class NoiseScheduleVP:
@@ -11,7 +12,7 @@ class NoiseScheduleVP:
             alphas_cumprod=None,
             continuous_beta_0=0.1,
             continuous_beta_1=20.,
-        ):
         """Create a wrapper class for the forward SDE (VP type).
         ***
@@ -93,7 +94,9 @@ class NoiseScheduleVP:
         """
         if schedule not in ['discrete', 'linear', 'cosine']:
-            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))
         self.schedule = schedule
         if schedule == 'discrete':
@@ -112,7 +115,8 @@ class NoiseScheduleVP:
             self.beta_1 = continuous_beta_1
             self.cosine_s = 0.008
             self.cosine_beta_max = 999.
-            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
             self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
             self.schedule = schedule
             if schedule == 'cosine':
@@ -127,12 +131,13 @@ class NoiseScheduleVP:
         Compute log(alpha_t) of a given continuous-time label t in [0, T].
         """
         if self.schedule == 'discrete':
-            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
         elif self.schedule == 'linear':
             return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
         elif self.schedule == 'cosine':
             log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
-            log_alpha_t =  log_alpha_fn(t) - self.cosine_log_alpha_0
             return log_alpha_t
     def marginal_alpha(self, t):
@@ -161,30 +166,32 @@ class NoiseScheduleVP:
         """
         if self.schedule == 'linear':
             tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            Delta = self.beta_0**2 + tmp
             return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
         elif self.schedule == 'discrete':
             log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
-            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
             return t.reshape((-1,))
         else:
             log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
             t = t_fn(log_alpha)
             return t
 def model_wrapper(
-    model,
-    noise_schedule,
-    model_type="noise",
-    model_kwargs={},
-    guidance_type="uncond",
-    condition=None,
-    unconditional_condition=None,
-    guidance_scale=1.,
-    classifier_fn=None,
-    classifier_kwargs={},
 ):
     """Create a wrapper function for the noise prediction model.
@@ -392,7 +399,7 @@ class DPM_Solver:
         alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
         x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
         if self.thresholding:
-            p = 0.995   # A hyperparameter in the paper of "Imagen" [1].
             s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
             s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
             x0 = torch.clamp(x0, -s, s) / s
@@ -431,10 +438,11 @@ class DPM_Solver:
             return torch.linspace(t_T, t_0, N + 1).to(device)
         elif skip_type == 'time_quadratic':
             t_order = 2
-            t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
             return t
         else:
-            raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
     def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
         """
@@ -471,28 +479,29 @@ class DPM_Solver:
         if order == 3:
             K = steps // 3 + 1
             if steps % 3 == 0:
-                orders = [3,] * (K - 2) + [2, 1]
             elif steps % 3 == 1:
-                orders = [3,] * (K - 1) + [1]
             else:
-                orders = [3,] * (K - 1) + [2]
         elif order == 2:
             if steps % 2 == 0:
                 K = steps // 2
-                orders = [2,] * K
             else:
                 K = steps // 2 + 1
-                orders = [2,] * (K - 1) + [1]
         elif order == 1:
             K = 1
-            orders = [1,] * steps
         else:
             raise ValueError("'order' must be '1' or '2' or '3'.")
         if skip_type == 'logSNR':
             # To reproduce the results in DPM-Solver paper
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
         else:
-            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders)).to(device)]
         return timesteps_outer, orders
     def denoise_to_zero_fn(self, x, s):
@@ -528,8 +537,8 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(sigma_t / sigma_s, dims) * x
-                - expand_dims(alpha_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
@@ -540,15 +549,16 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                - expand_dims(sigma_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
             else:
                 return x_t
-    def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-2 from time `s` to time `t`.
@@ -575,7 +585,8 @@ class DPM_Solver:
         h = lambda_t - lambda_s
         lambda_s1 = lambda_s + r1 * h
         s1 = ns.inverse_lambda(lambda_s1)
-        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(t)
         sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
         alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
@@ -586,21 +597,22 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(sigma_s1 / sigma_s, dims) * x
-                - expand_dims(alpha_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (model_s1 - model_s)
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -609,28 +621,29 @@ class DPM_Solver:
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
                 )
         if return_intermediate:
             return x_t, {'model_s': model_s, 'model_s1': model_s1}
         else:
             return x_t
-    def singlestep_dpm_solver_third_update(self, x, s, t, r1=1./3., r2=2./3., model_s=None, model_s1=None, return_intermediate=False, solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-3 from time `s` to time `t`.
@@ -664,8 +677,10 @@ class DPM_Solver:
         lambda_s2 = lambda_s + r2 * h
         s1 = ns.inverse_lambda(lambda_s1)
         s2 = ns.inverse_lambda(lambda_s2)
-        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
-        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t)
         alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
         if self.predict_x0:
@@ -680,21 +695,21 @@ class DPM_Solver:
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(sigma_s1 / sigma_s, dims) * x
-                    - expand_dims(alpha_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(sigma_s2 / sigma_s, dims) * x
-                - expand_dims(alpha_s2 * phi_12, dims) * model_s
-                + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -702,10 +717,10 @@ class DPM_Solver:
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + expand_dims(alpha_t * phi_2, dims) * D1
-                    - expand_dims(alpha_t * phi_3, dims) * D2
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -719,21 +734,21 @@ class DPM_Solver:
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                    - expand_dims(sigma_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s2 * phi_12, dims) * model_s
-                - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -741,10 +756,10 @@ class DPM_Solver:
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - expand_dims(sigma_t * phi_2, dims) * D1
-                    - expand_dims(sigma_t * phi_3, dims) * D2
                 )
         if return_intermediate:
@@ -772,7 +787,8 @@ class DPM_Solver:
         dims = x.dim()
         model_prev_1, model_prev_0 = model_prev_list
         t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -784,28 +800,28 @@ class DPM_Solver:
         if self.predict_x0:
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
                 )
         else:
             if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
                 )
         return x_t
@@ -827,7 +843,8 @@ class DPM_Solver:
         dims = x.dim()
         model_prev_2, model_prev_1, model_prev_0 = model_prev_list
         t_prev_2, t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -842,21 +859,22 @@ class DPM_Solver:
         D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1)
         if self.predict_x0:
             x_t = (
-                expand_dims(sigma_t / sigma_prev_0, dims) * x
-                - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
-                - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h**2 - 0.5), dims) * D2
             )
         else:
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
-                - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h**2 - 0.5), dims) * D2
             )
         return x_t
-    def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None, r2=None):
         """
         Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
@@ -876,9 +894,11 @@ class DPM_Solver:
         if order == 1:
             return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
         elif order == 2:
-            return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1)
         elif order == 3:
-            return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2)
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
@@ -906,7 +926,8 @@ class DPM_Solver:
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
-    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type='dpm_solver'):
         """
         The adaptive step size solver based on singlestep DPM-Solver.
@@ -938,11 +959,17 @@ class DPM_Solver:
         if order == 2:
             r1 = 0.5
             lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
         elif order == 3:
             r1, r2 = 1. / 3., 2. / 3.
-            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
         else:
             raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
         while torch.abs((s - t_0)).mean() > t_err:
@@ -963,9 +990,9 @@ class DPM_Solver:
         return x
     def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
-        method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
-        atol=0.0078, rtol=0.05,
-    ):
         """
         Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
@@ -1073,7 +1100,8 @@ class DPM_Solver:
         device = x.device
         if method == 'adaptive':
             with torch.no_grad():
-                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type)
         elif method == 'multistep':
             assert steps >= order
             timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
@@ -1083,19 +1111,21 @@ class DPM_Solver:
                 model_prev_list = [self.model_fn(x, vec_t)]
                 t_prev_list = [vec_t]
                 # Init the first `order` values by lower order multistep DPM-Solver.
-                for init_order in range(1, order):
                     vec_t = timesteps[init_order].expand(x.shape[0])
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order, solver_type=solver_type)
                     model_prev_list.append(self.model_fn(x, vec_t))
                     t_prev_list.append(vec_t)
                 # Compute the remaining values by `order`-th order multistep DPM-Solver.
-                for step in range(order, steps + 1):
                     vec_t = timesteps[step].expand(x.shape[0])
                     if lower_order_final and steps < 15:
                         step_order = min(order, steps + 1 - step)
                     else:
                         step_order = order
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order, solver_type=solver_type)
                     for i in range(order - 1):
                         t_prev_list[i] = t_prev_list[i + 1]
                         model_prev_list[i] = model_prev_list[i + 1]
@@ -1105,14 +1135,18 @@ class DPM_Solver:
                         model_prev_list[-1] = self.model_fn(x, vec_t)
         elif method in ['singlestep', 'singlestep_fixed']:
             if method == 'singlestep':
-                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device)
             elif method == 'singlestep_fixed':
                 K = steps // order
-                orders = [order,] * K
                 timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
             for i, order in enumerate(orders):
                 t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
-                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(), N=order, device=device)
                 lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                 vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0])
                 h = lambda_inner[-1] - lambda_inner[0]
@@ -1124,7 +1158,6 @@ class DPM_Solver:
         return x
 #############################################################
 # other utility functions
 #############################################################
@@ -1181,4 +1214,4 @@ def expand_dims(v, dims):
     Returns:
         a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
     """
-    return v[(...,) + (None,)*(dims - 1)]

 import torch
 import torch.nn.functional as F
 import math
+from tqdm import tqdm
 class NoiseScheduleVP:
             alphas_cumprod=None,
             continuous_beta_0=0.1,
             continuous_beta_1=20.,
+    ):
         """Create a wrapper class for the forward SDE (VP type).
         ***
         """
         if schedule not in ['discrete', 'linear', 'cosine']:
+            raise ValueError(
+                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
+                    schedule))
         self.schedule = schedule
         if schedule == 'discrete':
             self.beta_1 = continuous_beta_1
             self.cosine_s = 0.008
             self.cosine_beta_max = 999.
+            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
             self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
             self.schedule = schedule
             if schedule == 'cosine':
         Compute log(alpha_t) of a given continuous-time label t in [0, T].
         """
         if self.schedule == 'discrete':
+            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
+                                  self.log_alpha_array.to(t.device)).reshape((-1))
         elif self.schedule == 'linear':
             return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
         elif self.schedule == 'cosine':
             log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
+            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
             return log_alpha_t
     def marginal_alpha(self, t):
         """
         if self.schedule == 'linear':
             tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            Delta = self.beta_0 ** 2 + tmp
             return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
         elif self.schedule == 'discrete':
             log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
+            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
+                               torch.flip(self.t_array.to(lamb.device), [1]))
             return t.reshape((-1,))
         else:
             log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
             t = t_fn(log_alpha)
             return t
 def model_wrapper(
+        model,
+        noise_schedule,
+        model_type="noise",
+        model_kwargs={},
+        guidance_type="uncond",
+        condition=None,
+        unconditional_condition=None,
+        guidance_scale=1.,
+        classifier_fn=None,
+        classifier_kwargs={},
 ):
     """Create a wrapper function for the noise prediction model.
         alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
         x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
         if self.thresholding:
+            p = 0.995  # A hyperparameter in the paper of "Imagen" [1].
             s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
             s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
             x0 = torch.clamp(x0, -s, s) / s
             return torch.linspace(t_T, t_0, N + 1).to(device)
         elif skip_type == 'time_quadratic':
             t_order = 2
+            t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device)
             return t
         else:
+            raise ValueError(
+                "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
     def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
         """
         if order == 3:
             K = steps // 3 + 1
             if steps % 3 == 0:
+                orders = [3, ] * (K - 2) + [2, 1]
             elif steps % 3 == 1:
+                orders = [3, ] * (K - 1) + [1]
             else:
+                orders = [3, ] * (K - 1) + [2]
         elif order == 2:
             if steps % 2 == 0:
                 K = steps // 2
+                orders = [2, ] * K
             else:
                 K = steps // 2 + 1
+                orders = [2, ] * (K - 1) + [1]
         elif order == 1:
             K = 1
+            orders = [1, ] * steps
         else:
             raise ValueError("'order' must be '1' or '2' or '3'.")
         if skip_type == 'logSNR':
             # To reproduce the results in DPM-Solver paper
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
         else:
+            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
+                torch.cumsum(torch.tensor([0, ] + orders)).to(device)]
         return timesteps_outer, orders
     def denoise_to_zero_fn(self, x, s):
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
+                    expand_dims(sigma_t / sigma_s, dims) * x
+                    - expand_dims(alpha_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
+                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                    - expand_dims(sigma_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
             else:
                 return x_t
+    def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
+                                            solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-2 from time `s` to time `t`.
         h = lambda_t - lambda_s
         lambda_s1 = lambda_s + r1 * h
         s1 = ns.inverse_lambda(lambda_s1)
+        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
+            s1), ns.marginal_log_mean_coeff(t)
         sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
         alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
+                    expand_dims(sigma_s1 / sigma_s, dims) * x
+                    - expand_dims(alpha_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (
+                                    model_s1 - model_s)
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
+                    expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
+                    - expand_dims(sigma_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
                 )
         if return_intermediate:
             return x_t, {'model_s': model_s, 'model_s1': model_s1}
         else:
             return x_t
+    def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
+                                           return_intermediate=False, solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-3 from time `s` to time `t`.
         lambda_s2 = lambda_s + r2 * h
         s1 = ns.inverse_lambda(lambda_s1)
         s2 = ns.inverse_lambda(lambda_s2)
+        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
+            s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
+        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
+            s2), ns.marginal_std(t)
         alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
         if self.predict_x0:
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
+                        expand_dims(sigma_s1 / sigma_s, dims) * x
+                        - expand_dims(alpha_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
+                    expand_dims(sigma_s2 / sigma_s, dims) * x
+                    - expand_dims(alpha_s2 * phi_12, dims) * model_s
+                    + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
+                        expand_dims(sigma_t / sigma_s, dims) * x
+                        - expand_dims(alpha_t * phi_1, dims) * model_s
+                        + expand_dims(alpha_t * phi_2, dims) * D1
+                        - expand_dims(alpha_t * phi_3, dims) * D2
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
+                        expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
+                        - expand_dims(sigma_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
+                    expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
+                    - expand_dims(sigma_s2 * phi_12, dims) * model_s
+                    - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
+                        - expand_dims(sigma_t * phi_1, dims) * model_s
+                        - expand_dims(sigma_t * phi_2, dims) * D1
+                        - expand_dims(sigma_t * phi_3, dims) * D2
                 )
         if return_intermediate:
         dims = x.dim()
         model_prev_1, model_prev_0 = model_prev_list
         t_prev_1, t_prev_0 = t_prev_list
+        lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
+            t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
         if self.predict_x0:
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(sigma_t / sigma_prev_0, dims) * x
+                        - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                        - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
+                        expand_dims(sigma_t / sigma_prev_0, dims) * x
+                        - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                        + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
                 )
         else:
             if solver_type == 'dpm_solver':
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                        - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                        - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
+                        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                        - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                        - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
                 )
         return x_t
         dims = x.dim()
         model_prev_2, model_prev_1, model_prev_0 = model_prev_list
         t_prev_2, t_prev_1, t_prev_0 = t_prev_list
+        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(
+            t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
         D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1)
         if self.predict_x0:
             x_t = (
+                    expand_dims(sigma_t / sigma_prev_0, dims) * x
+                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
+                    + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
+                    - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2
             )
         else:
             x_t = (
+                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
+                    - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
+                    - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2
             )
         return x_t
+    def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None,
+                                     r2=None):
         """
         Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
         if order == 1:
             return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
         elif order == 2:
+            return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate,
+                                                            solver_type=solver_type, r1=r1)
         elif order == 3:
+            return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate,
+                                                           solver_type=solver_type, r1=r1, r2=r2)
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
+    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
+                            solver_type='dpm_solver'):
         """
         The adaptive step size solver based on singlestep DPM-Solver.
         if order == 2:
             r1 = 0.5
             lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
+            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
+                                                                                               solver_type=solver_type,
+                                                                                               **kwargs)
         elif order == 3:
             r1, r2 = 1. / 3., 2. / 3.
+            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
+                                                                                    return_intermediate=True,
+                                                                                    solver_type=solver_type)
+            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2,
+                                                                                              solver_type=solver_type,
+                                                                                              **kwargs)
         else:
             raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
         while torch.abs((s - t_0)).mean() > t_err:
         return x
     def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
+               method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
+               atol=0.0078, rtol=0.05,
+               ):
         """
         Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
         device = x.device
         if method == 'adaptive':
             with torch.no_grad():
+                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
+                                             solver_type=solver_type)
         elif method == 'multistep':
             assert steps >= order
             timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
                 model_prev_list = [self.model_fn(x, vec_t)]
                 t_prev_list = [vec_t]
                 # Init the first `order` values by lower order multistep DPM-Solver.
+                for init_order in tqdm(range(1, order), desc="DPM init order"):
                     vec_t = timesteps[init_order].expand(x.shape[0])
+                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order,
+                                                         solver_type=solver_type)
                     model_prev_list.append(self.model_fn(x, vec_t))
                     t_prev_list.append(vec_t)
                 # Compute the remaining values by `order`-th order multistep DPM-Solver.
+                for step in tqdm(range(order, steps + 1), desc="DPM multistep"):
                     vec_t = timesteps[step].expand(x.shape[0])
                     if lower_order_final and steps < 15:
                         step_order = min(order, steps + 1 - step)
                     else:
                         step_order = order
+                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order,
+                                                         solver_type=solver_type)
                     for i in range(order - 1):
                         t_prev_list[i] = t_prev_list[i + 1]
                         model_prev_list[i] = model_prev_list[i + 1]
                         model_prev_list[-1] = self.model_fn(x, vec_t)
         elif method in ['singlestep', 'singlestep_fixed']:
             if method == 'singlestep':
+                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order,
+                                                                                              skip_type=skip_type,
+                                                                                              t_T=t_T, t_0=t_0,
+                                                                                              device=device)
             elif method == 'singlestep_fixed':
                 K = steps // order
+                orders = [order, ] * K
                 timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
             for i, order in enumerate(orders):
                 t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
+                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(),
+                                                      N=order, device=device)
                 lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                 vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0])
                 h = lambda_inner[-1] - lambda_inner[0]
         return x
 #############################################################
 # other utility functions
 #############################################################
     Returns:
         a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
     """
+    return v[(...,) + (None,) * (dims - 1)]

ldm/models/diffusion/dpm_solver/sampler.py CHANGED Viewed

@@ -1,10 +1,15 @@
 """SAMPLING ONLY."""
 import torch
 from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
 class DPMSolverSampler(object):
     def __init__(self, model, **kwargs):
         super().__init__()
@@ -56,7 +61,7 @@ class DPMSolverSampler(object):
         C, H, W = shape
         size = (batch_size, C, H, W)
-        # print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')
         device = self.model.betas.device
         if x_T is None:
@@ -69,7 +74,7 @@ class DPMSolverSampler(object):
         model_fn = model_wrapper(
             lambda x, t, c: self.model.apply_model(x, t, c),
             ns,
-            model_type="noise",
             guidance_type="classifier-free",
             condition=conditioning,
             unconditional_condition=unconditional_conditioning,

 """SAMPLING ONLY."""
 import torch
 from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
+MODEL_TYPES = {
+    "eps": "noise",
+    "v": "v"
+}
 class DPMSolverSampler(object):
     def __init__(self, model, **kwargs):
         super().__init__()
         C, H, W = shape
         size = (batch_size, C, H, W)
+        print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')
         device = self.model.betas.device
         if x_T is None:
         model_fn = model_wrapper(
             lambda x, t, c: self.model.apply_model(x, t, c),
             ns,
+            model_type=MODEL_TYPES[self.model.parameterization],
             guidance_type="classifier-free",
             condition=conditioning,
             unconditional_condition=unconditional_conditioning,

ldm/models/diffusion/plms.py CHANGED Viewed

@@ -3,10 +3,9 @@
 import torch
 import numpy as np
 from tqdm import tqdm
-from functools import partial
-import copy
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
 class PLMSSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
         super().__init__()
@@ -24,7 +23,7 @@ class PLMSSampler(object):
         if ddim_eta != 0:
             raise ValueError('ddim_eta must be 0 for PLMS')
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
@@ -43,14 +42,14 @@ class PLMSSampler(object):
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
     @torch.no_grad()
@@ -75,11 +74,8 @@ class PLMSSampler(object):
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
-               features_adapter1=None,
-               features_adapter2=None,
-               mode = 'sketch',
-               con_strength=30,
-               style_feature=None,
                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
                **kwargs
                ):
@@ -113,11 +109,8 @@ class PLMSSampler(object):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
-                                                    features_adapter1=copy.deepcopy(features_adapter1),
-                                                    features_adapter2=copy.deepcopy(features_adapter2),
-                                                    mode = mode,
-                                                    con_strength = con_strength,
-                                                    style_feature=style_feature#.clone()
                                                     )
         return samples, intermediates
@@ -127,7 +120,8 @@ class PLMSSampler(object):
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,features_adapter1=None, features_adapter2=None, mode='sketch', con_strength=30, style_feature=None):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
@@ -141,7 +135,7 @@ class PLMSSampler(object):
             timesteps = self.ddim_timesteps[:subset_end]
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running PLMS Sampling with {total_steps} timesteps")
@@ -152,41 +146,21 @@ class PLMSSampler(object):
             index = total_steps - i - 1
             ts = torch.full((b,), step, device=device, dtype=torch.long)
             ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
-            cond_in = cond
-            unconditional_conditioning_in = unconditional_conditioning
-            if mask is not None :#and index>=10:
                 assert x0 is not None
                 img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                 img = img_orig * mask + (1. - mask) * img
-            if mode == 'sketch':
-                if index<con_strength:
-                    features_adapter = None
-                else:
-                    features_adapter = features_adapter1
-            elif mode == 'style':
-                if index<con_strength:
-                    features_adapter = None
-                else:
-                    features_adapter = features_adapter1
-                if index>25:
-                    cond_in = torch.cat([cond, style_feature.clone()], dim=1)
-                    unconditional_conditioning_in = torch.cat(
-                        [unconditional_conditioning, unconditional_conditioning[:, -8:, :]], dim=1)
-            elif mode == 'mul':
-                features_adapter = [a1i*0.5 + a2i for a1i, a2i in zip(features_adapter1, features_adapter2)]
-            else:
-                features_adapter = features_adapter1
-            outs = self.p_sample_plms(img, cond_in, ts, index=index, use_original_steps=ddim_use_original_steps,
                                       quantize_denoised=quantize_denoised, temperature=temperature,
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning_in,
-                                      old_eps=old_eps, t_next=ts_next, features_adapter=copy.deepcopy(features_adapter))
             img, pred_x0, e_t = outs
             old_eps.append(e_t)
@@ -204,17 +178,18 @@ class PLMSSampler(object):
     @torch.no_grad()
     def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, features_adapter=None):
         b, *_, device = *x.shape, x.device
         def get_model_output(x, t):
             if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-                e_t = self.model.apply_model(x, t, c, copy.deepcopy(features_adapter))
             else:
                 x_in = torch.cat([x] * 2)
                 t_in = torch.cat([t] * 2)
                 c_in = torch.cat([unconditional_conditioning, c])
-                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in, copy.deepcopy(features_adapter)).chunk(2)
                 e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
             if score_corrector is not None:
@@ -233,14 +208,14 @@ class PLMSSampler(object):
             a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
             a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
             sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
             # current prediction for x_0
             pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
             if quantize_denoised:
                 pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
             # direction pointing to x_t
-            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
             noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
             if noise_dropout > 0.:
                 noise = torch.nn.functional.dropout(noise, p=noise_dropout)

 import torch
 import numpy as np
 from tqdm import tqdm
 from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
 class PLMSSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
         super().__init__()
         if ddim_eta != 0:
             raise ValueError('ddim_eta must be 0 for PLMS')
         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
         alphas_cumprod = self.model.alphas_cumprod
         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
         # ddim sampling parameters
         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                    ddim_timesteps=self.ddim_timesteps,
+                                                                                   eta=ddim_eta, verbose=verbose)
         self.register_buffer('ddim_sigmas', ddim_sigmas)
         self.register_buffer('ddim_alphas', ddim_alphas)
         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
     @torch.no_grad()
                log_every_t=100,
                unconditional_guidance_scale=1.,
                unconditional_conditioning=None,
+               features_adapter=None,
+               cond_tau=0.4,
                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
                **kwargs
                ):
                                                     log_every_t=log_every_t,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=unconditional_conditioning,
+                                                    features_adapter=features_adapter,
+                                                    cond_tau=cond_tau
                                                     )
         return samples, intermediates
                       callback=None, timesteps=None, quantize_denoised=False,
                       mask=None, x0=None, img_callback=None, log_every_t=100,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
+                      cond_tau=0.4):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
             timesteps = self.ddim_timesteps[:subset_end]
         intermediates = {'x_inter': [img], 'pred_x0': [img]}
+        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running PLMS Sampling with {total_steps} timesteps")
             index = total_steps - i - 1
             ts = torch.full((b,), step, device=device, dtype=torch.long)
             ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
+            if mask is not None:  # and index>=10:
                 assert x0 is not None
                 img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                 img = img_orig * mask + (1. - mask) * img
+            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                       quantize_denoised=quantize_denoised, temperature=temperature,
                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
                                       corrector_kwargs=corrector_kwargs,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      old_eps=old_eps, t_next=ts_next,
+                                      features_adapter=None if index < int(
+                                          (1 - cond_tau) * total_steps) else features_adapter)
             img, pred_x0, e_t = outs
             old_eps.append(e_t)
     @torch.no_grad()
     def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
+                      features_adapter=None):
         b, *_, device = *x.shape, x.device
         def get_model_output(x, t):
             if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+                e_t = self.model.apply_model(x, t, c, features_adapter=features_adapter)
             else:
                 x_in = torch.cat([x] * 2)
                 t_in = torch.cat([t] * 2)
                 c_in = torch.cat([unconditional_conditioning, c])
+                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in, features_adapter=features_adapter).chunk(2)
                 e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
             if score_corrector is not None:
             a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
             a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
             sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
+            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
             # current prediction for x_0
             pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
             if quantize_denoised:
                 pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
             # direction pointing to x_t
+            dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
             noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
             if noise_dropout > 0.:
                 noise = torch.nn.functional.dropout(noise, p=noise_dropout)

ldm/modules/attention.py CHANGED Viewed

@@ -20,6 +20,10 @@ except:
 import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 def exists(val):
     return val is not None

 import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
+if os.environ.get("DISABLE_XFORMERS", "false").lower() == 'true':
+    XFORMERS_IS_AVAILBLE = False
 def exists(val):
     return val is not None

ldm/modules/diffusionmodules/openaimodel.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from abc import abstractmethod
-from functools import partial
 import math
-from typing import Iterable
 import numpy as np
 import torch as th
@@ -18,6 +17,7 @@ from ldm.modules.diffusionmodules.util import (
     timestep_embedding,
 )
 from ldm.modules.attention import SpatialTransformer
 # dummy replace
@@ -270,8 +270,6 @@ class ResBlock(TimestepBlock):
             h = out_norm(h) * (1 + scale) + shift
             h = out_rest(h)
         else:
-            # print(h.shape, emb_out.shape)
-            # exit(0)
             h = h + emb_out
             h = self.out_layers(h)
         return self.skip_connection(x) + h
@@ -468,16 +466,16 @@ class UNetModel(nn.Module):
         context_dim=None,                 # custom transformer support
         n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
         legacy=True,
-        # l_cond = 4,
     ):
         super().__init__()
-        # print('UNet', context_dim)
         if use_spatial_transformer:
             assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
         if context_dim is not None:
-            # print('UNet not none', context_dim, context_dim is not None, context_dim != None, context_dim == "None")
             assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
             from omegaconf.listconfig import ListConfig
             if type(context_dim) == ListConfig:
@@ -496,7 +494,24 @@ class UNetModel(nn.Module):
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
@@ -508,9 +523,6 @@ class UNetModel(nn.Module):
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
         self.predict_codebook_ids = n_embed is not None
-        # self.l_cond = l_cond
-        # print(self.l_cond)
-        # exit(0)
         time_embed_dim = model_channels * 4
         self.time_embed = nn.Sequential(
@@ -520,7 +532,13 @@ class UNetModel(nn.Module):
         )
         if self.num_classes is not None:
-            self.label_emb = nn.Embedding(num_classes, time_embed_dim)
         self.input_blocks = nn.ModuleList(
             [
@@ -534,7 +552,7 @@ class UNetModel(nn.Module):
         ch = model_channels
         ds = 1
         for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
                 layers = [
                     ResBlock(
                         ch,
@@ -556,17 +574,25 @@ class UNetModel(nn.Module):
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                         )
-                    )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
@@ -618,8 +644,10 @@ class UNetModel(nn.Module):
                 num_heads=num_heads,
                 num_head_channels=dim_head,
                 use_new_attention_order=use_new_attention_order,
-            ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                         ),
             ResBlock(
                 ch,
@@ -634,7 +662,7 @@ class UNetModel(nn.Module):
         self.output_blocks = nn.ModuleList([])
         for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(num_res_blocks + 1):
                 ich = input_block_chans.pop()
                 layers = [
                     ResBlock(
@@ -657,18 +685,26 @@ class UNetModel(nn.Module):
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads_upsample,
-                            num_head_channels=dim_head,
-                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
                         )
-                    )
-                if level and i == num_res_blocks:
                     out_ch = ch
                     layers.append(
                         ResBlock(
@@ -716,7 +752,7 @@ class UNetModel(nn.Module):
         self.middle_block.apply(convert_module_to_f32)
         self.output_blocks.apply(convert_module_to_f32)
-    def forward(self, x, timesteps=None, context=None, y=None, features_adapter=None, step_cur=0,**kwargs):
         """
         Apply the model to an input batch.
         :param x: an [N x C x ...] Tensor of inputs.
@@ -733,21 +769,26 @@ class UNetModel(nn.Module):
         emb = self.time_embed(t_emb)
         if self.num_classes is not None:
-            assert y.shape == (x.shape[0],)
             emb = emb + self.label_emb(y)
         h = x.type(self.dtype)
         for id, module in enumerate(self.input_blocks):
             h = module(h, emb, context)
-            if ((id+1)%3 == 0) and features_adapter is not None and len(features_adapter):
-                h = h + features_adapter.pop(0)
             hs.append(h)
         if features_adapter is not None:
-            assert len(features_adapter)==0, 'Wrong features_adapter'
         h = self.middle_block(h, emb, context)
-        for id, module in enumerate(self.output_blocks):
             h = th.cat([h, hs.pop()], dim=1)
             h = module(h, emb, context)
         h = h.type(x.dtype)
@@ -755,222 +796,3 @@ class UNetModel(nn.Module):
             return self.id_predictor(h)
         else:
             return self.out(h)
-class EncoderUNetModel(nn.Module):
-    """
-    The half UNet model with attention and timestep embedding.
-    For usage, see UNet.
-    """
-    def __init__(
-        self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        pool="adaptive",
-        *args,
-        **kwargs
-    ):
-        super().__init__()
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-        self.in_channels = in_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        self.num_res_blocks = num_res_blocks
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.use_checkpoint = use_checkpoint
-        self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-        time_embed_dim = model_channels * 4
-        self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
-            nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
-        )
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for _ in range(num_res_blocks):
-                layers = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    layers.append(
-                        AttentionBlock(
-                            ch,
-                            use_checkpoint=use_checkpoint,
-                            num_heads=num_heads,
-                            num_head_channels=num_head_channels,
-                            use_new_attention_order=use_new_attention_order,
-                        )
-                    )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True,
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-        self.middle_block = TimestepEmbedSequential(
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-            AttentionBlock(
-                ch,
-                use_checkpoint=use_checkpoint,
-                num_heads=num_heads,
-                num_head_channels=num_head_channels,
-                use_new_attention_order=use_new_attention_order,
-            ),
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_checkpoint=use_checkpoint,
-                use_scale_shift_norm=use_scale_shift_norm,
-            ),
-        )
-        self._feature_size += ch
-        self.pool = pool
-        if pool == "adaptive":
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                nn.AdaptiveAvgPool2d((1, 1)),
-                zero_module(conv_nd(dims, ch, out_channels, 1)),
-                nn.Flatten(),
-            )
-        elif pool == "attention":
-            assert num_head_channels != -1
-            self.out = nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                AttentionPool2d(
-                    (image_size // ds), ch, num_head_channels, out_channels
-                ),
-            )
-        elif pool == "spatial":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                nn.ReLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        elif pool == "spatial_v2":
-            self.out = nn.Sequential(
-                nn.Linear(self._feature_size, 2048),
-                normalization(2048),
-                nn.SiLU(),
-                nn.Linear(2048, self.out_channels),
-            )
-        else:
-            raise NotImplementedError(f"Unexpected {pool} pooling")
-    def convert_to_fp16(self):
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)
-        self.middle_block.apply(convert_module_to_f16)
-    def convert_to_fp32(self):
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)
-        self.middle_block.apply(convert_module_to_f32)
-    def forward(self, x, timesteps):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :return: an [N x K] Tensor of outputs.
-        """
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
-        results = []
-        h = x.type(self.dtype)
-        for module in self.input_blocks:
-            h = module(h, emb)
-            if self.pool.startswith("spatial"):
-                results.append(h.type(x.dtype).mean(dim=(2, 3)))
-        h = self.middle_block(h, emb)
-        if self.pool.startswith("spatial"):
-            results.append(h.type(x.dtype).mean(dim=(2, 3)))
-            h = th.cat(results, axis=-1)
-            return self.out(h)
-        else:
-            h = h.type(x.dtype)
-            return self.out(h)

 from abc import abstractmethod
 import math
+import torch
 import numpy as np
 import torch as th
     timestep_embedding,
 )
 from ldm.modules.attention import SpatialTransformer
+from ldm.util import exists
 # dummy replace
             h = out_norm(h) * (1 + scale) + shift
             h = out_rest(h)
         else:
             h = h + emb_out
             h = self.out_layers(h)
         return self.skip_connection(x) + h
         context_dim=None,                 # custom transformer support
         n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
         legacy=True,
+        disable_self_attentions=None,
+        num_attention_blocks=None,
+        disable_middle_self_attn=False,
+        use_linear_in_transformer=False,
     ):
         super().__init__()
         if use_spatial_transformer:
             assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
         if context_dim is not None:
             assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
             from omegaconf.listconfig import ListConfig
             if type(context_dim) == ListConfig:
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
+                                 "as a list/tuple (per-level) with the same length as channel_mult")
+            self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
+            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                  f"attention will still not be set.")
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
         self.predict_codebook_ids = n_embed is not None
         time_embed_dim = model_channels * 4
         self.time_embed = nn.Sequential(
         )
         if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            else:
+                raise ValueError()
         self.input_blocks = nn.ModuleList(
             [
         ch = model_channels
         ds = 1
         for level, mult in enumerate(channel_mult):
+            for nr in range(self.num_res_blocks[level]):
                 layers = [
                     ResBlock(
                         ch,
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                use_checkpoint=use_checkpoint,
+                                num_heads=num_heads,
+                                num_head_channels=dim_head,
+                                use_new_attention_order=use_new_attention_order,
+                            ) if not use_spatial_transformer else SpatialTransformer(
+                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+                                use_checkpoint=use_checkpoint
+                            )
                         )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
                 num_heads=num_heads,
                 num_head_channels=dim_head,
                 use_new_attention_order=use_new_attention_order,
+            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
+                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                            disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
+                            use_checkpoint=use_checkpoint
                         ),
             ResBlock(
                 ch,
         self.output_blocks = nn.ModuleList([])
         for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
                     ResBlock(
                     if legacy:
                         #num_heads = 1
                         dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
+                        layers.append(
+                            AttentionBlock(
+                                ch,
+                                use_checkpoint=use_checkpoint,
+                                num_heads=num_heads_upsample,
+                                num_head_channels=dim_head,
+                                use_new_attention_order=use_new_attention_order,
+                            ) if not use_spatial_transformer else SpatialTransformer(
+                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+                                use_checkpoint=use_checkpoint
+                            )
                         )
+                if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
                         ResBlock(
         self.middle_block.apply(convert_module_to_f32)
         self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps=None, context=None, y=None, features_adapter=None, append_to_context=None, **kwargs):
         """
         Apply the model to an input batch.
         :param x: an [N x C x ...] Tensor of inputs.
         emb = self.time_embed(t_emb)
         if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
             emb = emb + self.label_emb(y)
         h = x.type(self.dtype)
+        if append_to_context is not None:
+            context = torch.cat([context, append_to_context], dim=1)
+        adapter_idx = 0
         for id, module in enumerate(self.input_blocks):
             h = module(h, emb, context)
+            if ((id+1)%3 == 0) and features_adapter is not None:
+                h = h + features_adapter[adapter_idx]
+                adapter_idx += 1
             hs.append(h)
         if features_adapter is not None:
+            assert len(features_adapter)==adapter_idx, 'Wrong features_adapter'
         h = self.middle_block(h, emb, context)
+        for module in self.output_blocks:
             h = th.cat([h, hs.pop()], dim=1)
             h = module(h, emb, context)
         h = h.type(x.dtype)
             return self.id_predictor(h)
         else:
             return self.out(h)

ldm/modules/diffusionmodules/util.py CHANGED Viewed

@@ -122,7 +122,9 @@ class CheckpointFunction(torch.autograd.Function):
         ctx.run_function = run_function
         ctx.input_tensors = list(args[:length])
         ctx.input_params = list(args[length:])
         with torch.no_grad():
             output_tensors = ctx.run_function(*ctx.input_tensors)
         return output_tensors
@@ -130,7 +132,8 @@ class CheckpointFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, *output_grads):
         ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
-        with torch.enable_grad():
             # Fixes a bug where the first op in run_function modifies the
             # Tensor storage in place, which is not allowed for detach()'d
             # Tensors.

         ctx.run_function = run_function
         ctx.input_tensors = list(args[:length])
         ctx.input_params = list(args[length:])
+        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
+                                   "dtype": torch.get_autocast_gpu_dtype(),
+                                   "cache_enabled": torch.is_autocast_cache_enabled()}
         with torch.no_grad():
             output_tensors = ctx.run_function(*ctx.input_tensors)
         return output_tensors
     @staticmethod
     def backward(ctx, *output_grads):
         ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        with torch.enable_grad(), \
+                torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
             # Fixes a bug where the first op in run_function modifies the
             # Tensor storage in place, which is not allowed for detach()'d
             # Tensors.

ldm/modules/ema.py CHANGED Viewed

@@ -10,24 +10,28 @@ class LitEma(nn.Module):
         self.m_name2s_name = {}
         self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
-        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
-                             else torch.tensor(-1,dtype=torch.int))
         for name, p in model.named_parameters():
             if p.requires_grad:
-                #remove as '.'-character is not allowed in buffers
-                s_name = name.replace('.','')
-                self.m_name2s_name.update({name:s_name})
-                self.register_buffer(s_name,p.clone().detach().data)
         self.collected_params = []
-    def forward(self,model):
         decay = self.decay
         if self.num_updates >= 0:
             self.num_updates += 1
-            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
         one_minus_decay = 1.0 - decay

         self.m_name2s_name = {}
         self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
+        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates
+        else torch.tensor(-1, dtype=torch.int))
         for name, p in model.named_parameters():
             if p.requires_grad:
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace('.', '')
+                self.m_name2s_name.update({name: s_name})
+                self.register_buffer(s_name, p.clone().detach().data)
         self.collected_params = []
+    def reset_num_updates(self):
+        del self.num_updates
+        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int))
+    def forward(self, model):
         decay = self.decay
         if self.num_updates >= 0:
             self.num_updates += 1
+            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
         one_minus_decay = 1.0 - decay

ldm/modules/encoders/adapter.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from ldm.modules.attention import SpatialTransformer, BasicTransformerBlock
 from collections import OrderedDict
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
@@ -16,6 +15,7 @@ def conv_nd(dims, *args, **kwargs):
         return nn.Conv3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 def avg_pool_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D average pooling module.
@@ -28,6 +28,7 @@ def avg_pool_nd(dims, *args, **kwargs):
         return nn.AvgPool3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
@@ -37,7 +38,7 @@ class Downsample(nn.Module):
                  downsampling occurs in the inner-two dimensions.
     """
-    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
@@ -60,15 +61,16 @@ class Downsample(nn.Module):
 class ResnetBlock(nn.Module):
     def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
         super().__init__()
-        ps = ksize//2
-        if in_c != out_c or sk==False:
             self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
             self.in_conv = None
         self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
         self.act = nn.ReLU()
         self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
-        if sk==False:
             self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
             self.skep = None
@@ -80,7 +82,7 @@ class ResnetBlock(nn.Module):
     def forward(self, x):
         if self.down == True:
             x = self.down_opt(x)
-        if self.in_conv is not None: # edit
             x = self.in_conv(x)
         h = self.block1(x)
@@ -101,12 +103,14 @@ class Adapter(nn.Module):
         self.body = []
         for i in range(len(channels)):
             for j in range(nums_rb):
-                if (i!=0) and (j==0):
-                    self.body.append(ResnetBlock(channels[i-1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
                 else:
-                    self.body.append(ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
         self.body = nn.ModuleList(self.body)
-        self.conv_in = nn.Conv2d(cin,channels[0], 3, 1, 1)
     def forward(self, x):
         # unshuffle
@@ -116,12 +120,79 @@ class Adapter(nn.Module):
         x = self.conv_in(x)
         for i in range(len(self.channels)):
             for j in range(self.nums_rb):
-                idx = i*self.nums_rb +j
                 x = self.body[idx](x)
             features.append(x)
         return features
 class ResnetBlock_light(nn.Module):
     def __init__(self, in_c):
@@ -185,66 +256,3 @@ class Adapter_light(nn.Module):
             features.append(x)
         return features
-class QuickGELU(nn.Module):
-    def forward(self, x: torch.Tensor):
-        return x * torch.sigmoid(1.702 * x)
-class ResidualAttentionBlock(nn.Module):
-    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
-        super().__init__()
-        self.attn = nn.MultiheadAttention(d_model, n_head)
-        self.ln_1 = LayerNorm(d_model)
-        self.mlp = nn.Sequential(
-            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
-                         ("c_proj", nn.Linear(d_model * 4, d_model))]))
-        self.ln_2 = LayerNorm(d_model)
-        self.attn_mask = attn_mask
-    def attention(self, x: torch.Tensor):
-        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
-        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
-    def forward(self, x: torch.Tensor):
-        x = x + self.attention(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-class LayerNorm(nn.LayerNorm):
-    """Subclass torch's LayerNorm to handle fp16."""
-    def forward(self, x: torch.Tensor):
-        orig_type = x.dtype
-        ret = super().forward(x.type(torch.float32))
-        return ret.type(orig_type)
-class StyleAdapter(nn.Module):
-    def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4):
-        super().__init__()
-        scale = width ** -0.5
-        self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)])
-        self.num_token = num_token
-        self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale)
-        self.ln_post = LayerNorm(width)
-        self.ln_pre = LayerNorm(width)
-        self.proj = nn.Parameter(scale * torch.randn(width, context_dim))
-    def forward(self, x):
-        # x shape [N, HW+1, C]
-        style_embedding = self.style_embedding + torch.zeros(
-            (x.shape[0], self.num_token, self.style_embedding.shape[-1]), device=x.device)
-        x = torch.cat([x, style_embedding], dim=1)
-        x = self.ln_pre(x)
-        x = x.permute(1, 0, 2)  # NLD -> LND
-        x = self.transformer_layes(x)
-        x = x.permute(1, 0, 2)  # LND -> NLD
-        x = self.ln_post(x[:, -self.num_token:, :])
-        x = x @ self.proj
-        return x

 import torch
 import torch.nn as nn
 from collections import OrderedDict
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
         return nn.Conv3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 def avg_pool_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D average pooling module.
         return nn.AvgPool3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
                  downsampling occurs in the inner-two dimensions.
     """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
 class ResnetBlock(nn.Module):
     def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
         super().__init__()
+        ps = ksize // 2
+        if in_c != out_c or sk == False:
             self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
+            # print('n_in')
             self.in_conv = None
         self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
         self.act = nn.ReLU()
         self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
+        if sk == False:
             self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
         else:
             self.skep = None
     def forward(self, x):
         if self.down == True:
             x = self.down_opt(x)
+        if self.in_conv is not None:  # edit
             x = self.in_conv(x)
         h = self.block1(x)
         self.body = []
         for i in range(len(channels)):
             for j in range(nums_rb):
+                if (i != 0) and (j == 0):
+                    self.body.append(
+                        ResnetBlock(channels[i - 1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
                 else:
+                    self.body.append(
+                        ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
         self.body = nn.ModuleList(self.body)
+        self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)
     def forward(self, x):
         # unshuffle
         x = self.conv_in(x)
         for i in range(len(self.channels)):
             for j in range(self.nums_rb):
+                idx = i * self.nums_rb + j
                 x = self.body[idx](x)
             features.append(x)
         return features
class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16.

    The input is cast to float32 for the normalization itself and the result
    is cast back to the dtype the caller passed in.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` in float32 and return it in ``x``'s original dtype."""
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)
class QuickGELU(nn.Module):
    """Sigmoid-gated activation computing ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the activation element-wise; output has the shape of ``x``."""
        return x * torch.sigmoid(1.702 * x)
class ResidualAttentionBlock(nn.Module):
    """Transformer block: self-attention then an MLP, each added residually.

    Both sub-layers are applied to a layer-normalized copy of the input
    (``ln_1`` before attention, ``ln_2`` before the MLP).

    :param d_model: embedding width of the tokens.
    :param n_head: number of heads given to ``nn.MultiheadAttention``.
    :param attn_mask: optional mask forwarded to the attention layer on every
        call; ``None`` means no mask is passed.
    """

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
                         ("c_proj", nn.Linear(d_model * 4, d_model))]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Run self-attention over ``x`` (used as query, key and value)."""
        # Convert the mask into a local variable instead of reassigning
        # ``self.attn_mask``: the forward pass then has no side effect on the
        # module, and the stored mask stays exactly as the caller supplied it.
        attn_mask = self.attn_mask
        if attn_mask is not None:
            attn_mask = attn_mask.to(dtype=x.dtype, device=x.device)
        # ``nn.MultiheadAttention`` returns (output, weights); only the output is used.
        return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]

    def forward(self, x: torch.Tensor):
        """Return ``x`` after the attention and MLP residual updates."""
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
class StyleAdapter(nn.Module):
    """Map a token sequence to ``num_token`` style tokens of size ``context_dim``.

    Learned style tokens are appended to the input sequence, the combined
    sequence is run through a small transformer, and only the positions of the
    appended tokens are kept and projected to ``context_dim``.

    :param width: channel size of the incoming tokens and of the transformer.
    :param context_dim: channel size of the returned tokens.
    :param num_head: attention heads per transformer block.
    :param n_layes: number of transformer blocks.
    :param num_token: number of learned style tokens appended (and returned).
    """

    def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4):
        super().__init__()

        scale = width ** -0.5
        self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)])
        self.num_token = num_token
        self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale)
        self.ln_post = LayerNorm(width)
        self.ln_pre = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, context_dim))

    def forward(self, x):
        """Return style tokens of shape ``[N, num_token, context_dim]``.

        :param x: tokens of shape ``[N, L, width]`` (the original comment
            describes this as ``[N, HW+1, C]``).
        """
        # x shape [N, HW+1, C]
        # Broadcast the learned tokens over the batch with a view rather than
        # by adding a freshly allocated zero tensor: no extra allocation, and
        # the embedding keeps its own dtype instead of being promoted to the
        # default float32 of ``torch.zeros``.
        style_embedding = self.style_embedding.expand(x.shape[0], -1, -1)
        x = torch.cat([x, style_embedding], dim=1)
        x = self.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer_layes(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        # Keep only the positions of the appended style tokens.
        x = self.ln_post(x[:, -self.num_token:, :])
        x = x @ self.proj

        return x
 class ResnetBlock_light(nn.Module):
     def __init__(self, in_c):
             features.append(x)
         return features

ldm/modules/encoders/modules.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import torch
 import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
-from transformers import CLIPTokenizer, CLIPTextModel
-import kornia
-from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
 class AbstractEncoder(nn.Module):
@@ -17,6 +18,11 @@ class AbstractEncoder(nn.Module):
         raise NotImplementedError
 class ClassEmbedder(nn.Module):
     def __init__(self, embed_dim, n_classes=1000, key='class'):
@@ -33,116 +39,48 @@ class ClassEmbedder(nn.Module):
         return c
-class TransformerEmbedder(AbstractEncoder):
-    """Some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
         super().__init__()
         self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
-    def forward(self, tokens):
-        tokens = tokens.to(self.device)  # meh
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
-    def encode(self, x):
-        return self(x)
-class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
-    def __init__(self, device="cuda", vq_interface=True, max_length=77):
-        super().__init__()
-        from transformers import BertTokenizerFast  # TODO: add to reuquirements
-        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-        self.device = device
-        self.vq_interface = vq_interface
-        self.max_length = max_length
     def forward(self, text):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        return tokens
-    @torch.no_grad()
-    def encode(self, text):
-        tokens = self(text)
-        if not self.vq_interface:
-            return tokens
-        return None, None, [None, None, tokens]
-    def decode(self, text):
-        return text
-class BERTEmbedder(AbstractEncoder):
-    """Uses the BERT tokenizr model and add some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
-                 device="cuda",use_tokenizer=True, embedding_dropout=0.0):
-        super().__init__()
-        self.use_tknz_fn = use_tokenizer
-        if self.use_tknz_fn:
-            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
-                                              emb_dropout=embedding_dropout)
-    def forward(self, text):
-        if self.use_tknz_fn:
-            tokens = self.tknz_fn(text)#.to(self.device)
-        else:
-            tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
         return z
     def encode(self, text):
-        # output of length 77
         return self(text)
-class SpatialRescaler(nn.Module):
-    def __init__(self,
-                 n_stages=1,
-                 method='bilinear',
-                 multiplier=0.5,
-                 in_channels=3,
-                 out_channels=None,
-                 bias=False):
-        super().__init__()
-        self.n_stages = n_stages
-        assert self.n_stages >= 0
-        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
-        self.multiplier = multiplier
-        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
-        self.remap_output = out_channels is not None
-        if self.remap_output:
-            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
-            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
-    def forward(self,x):
-        for stage in range(self.n_stages):
-            x = self.interpolator(x, scale_factor=self.multiplier)
-        if self.remap_output:
-            x = self.channel_mapper(x)
-        return x
-    def encode(self, x):
-        return self(x)
 class FrozenCLIPEmbedder(AbstractEncoder):
-    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
         super().__init__()
         self.tokenizer = CLIPTokenizer.from_pretrained(version)
-        self.transformer = CLIPTextModel.from_pretrained(version)
         self.device = device
         self.max_length = max_length
-        self.freeze()
     def freeze(self):
         self.transformer = self.transformer.eval()
@@ -153,26 +91,47 @@ class FrozenCLIPEmbedder(AbstractEncoder):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
-        z = outputs.last_hidden_state
         return z
     def encode(self, text):
         return self(text)
-class FrozenCLIPTextEmbedder(nn.Module):
     """
-    Uses the CLIP transformer encoder for text.
     """
-    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
         super().__init__()
-        self.model, _ = clip.load(version, jit=False, device="cpu")
         self.device = device
         self.max_length = max_length
-        self.n_repeat = n_repeat
-        self.normalize = normalize
     def freeze(self):
         self.model = self.model.eval()
@@ -180,55 +139,303 @@ class FrozenCLIPTextEmbedder(nn.Module):
             param.requires_grad = False
     def forward(self, text):
-        tokens = clip.tokenize(text).to(self.device)
-        z = self.model.encode_text(tokens)
-        if self.normalize:
-            z = z / torch.linalg.norm(z, dim=1, keepdim=True)
         return z
     def encode(self, text):
-        z = self(text)
-        if z.ndim==2:
-            z = z[:, None, :]
-        z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
-        return z
-class FrozenClipImageEmbedder(nn.Module):
-    """
-        Uses the CLIP image encoder.
-        """
-    def __init__(
-            self,
-            model,
-            jit=False,
-            device='cuda' if torch.cuda.is_available() else 'cpu',
-            antialias=False,
-        ):
         super().__init__()
-        self.model, _ = clip.load(name=model, device=device, jit=jit)
-        self.antialias = antialias
-        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
-        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
-    def preprocess(self, x):
-        # normalize to [0,1]
-        x = kornia.geometry.resize(x, (224, 224),
-                                   interpolation='bicubic',align_corners=True,
-                                   antialias=self.antialias)
-        x = (x + 1.) / 2.
-        # renormalize according to clip
-        x = kornia.enhance.normalize(x, self.mean, self.std)
-        return x
-    def forward(self, x):
-        # x is assumed to be in range [-1,1]
-        return self.model.encode_image(self.preprocess(x))
 if __name__ == "__main__":
-    from ldm.util import count_params
     model = FrozenCLIPEmbedder()
-    count_params(model, verbose=True)

 import torch
 import torch.nn as nn
+import math
+from torch.utils.checkpoint import checkpoint
+from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel, CLIPModel
+import open_clip
+import re
+from ldm.util import default, count_params
 class AbstractEncoder(nn.Module):
         raise NotImplementedError
class IdentityEncoder(AbstractEncoder):
    """Encoder whose ``encode`` returns its input unchanged."""

    def encode(self, x):
        """Return ``x`` as-is."""
        return x
 class ClassEmbedder(nn.Module):
     def __init__(self, embed_dim, n_classes=1000, key='class'):
         return c
class FrozenT5Embedder(AbstractEncoder):
    """Uses the T5 transformer encoder for text.

    :param version: pretrained checkpoint name passed to ``from_pretrained``;
        others are google/t5-v1_1-xl and google/t5-v1_1-xxl.
    :param device: device the token ids are moved to before encoding.
    :param max_length: prompts are truncated/padded to this many tokens.
    :param freeze: when true, put the transformer in eval mode and disable
        gradients for all parameters.
    """

    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True):
        super().__init__()
        self.tokenizer = T5Tokenizer.from_pretrained(version)
        self.transformer = T5EncoderModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length   # TODO: typical value?
        if freeze:
            self.freeze()

    def freeze(self):
        """Switch the transformer to eval mode and stop gradient tracking."""
        self.transformer = self.transformer.eval()
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` and return the encoder's last hidden state."""
        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        # Only the token ids are moved here; the module itself is not.
        tokens = batch_encoding["input_ids"].to(self.device)
        outputs = self.transformer(input_ids=tokens)

        z = outputs.last_hidden_state
        return z

    def encode(self, text):
        """Alias for calling the module on ``text``."""
        return self(text)
 class FrozenCLIPEmbedder(AbstractEncoder):
+    """Uses the CLIP transformer encoder for text (from huggingface)"""
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
+                 freeze=True, layer="last"):  # clip-vit-base-patch32
         super().__init__()
         self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPModel.from_pretrained(version).text_model
         self.device = device
         self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
     def freeze(self):
         self.transformer = self.transformer.eval()
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer != 'last')
+        if self.layer == 'penultimate':
+            z = outputs.hidden_states[-2]
+            z = self.transformer.final_layer_norm(z)
+        else:
+            z = outputs.last_hidden_state
         return z
     def encode(self, text):
         return self(text)
+class FrozenOpenCLIPEmbedder(AbstractEncoder):
     """
+    Uses the OpenCLIP transformer encoder for text
     """
+    LAYERS = [
+        #"pooled",
+        "last",
+        "penultimate"
+    ]
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
+                 freeze=True, layer="last"):
         super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
+        del model.visual
+        self.model = model
         self.device = device
         self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
     def freeze(self):
         self.model = self.model.eval()
             param.requires_grad = False
     def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
         return z
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
     def encode(self, text):
+        return self(text)
class FrozenCLIPT5Encoder(AbstractEncoder):
    """Encode a prompt with both a CLIP text encoder and a T5 encoder.

    :param clip_version: checkpoint name for ``FrozenCLIPEmbedder``.
    :param t5_version: checkpoint name for ``FrozenT5Embedder``.
    :param device: device handed to both sub-encoders.
    :param clip_max_length: token length used by the CLIP encoder.
    :param t5_max_length: token length used by the T5 encoder.
    """

    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
                 clip_max_length=77, t5_max_length=77):
        super().__init__()
        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
        # Report the size of each sub-encoder once at construction time.
        print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, "
              f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.")

    def encode(self, text):
        """Alias for calling the module on ``text``."""
        return self(text)

    def forward(self, text):
        """Return ``[clip_z, t5_z]``, the two encoders' outputs for ``text``."""
        clip_z = self.clip_encoder.encode(text)
        t5_z = self.t5_encoder.encode(text)
        return [clip_z, t5_z]
# code from sd-webui
# Tokenizer for the prompt-attention syntax. Alternatives are tried in order:
# escaped characters first, then brackets, then an explicit ":weight)" suffix
# (the only capturing group), then plain text.
re_attention = re.compile(r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""", re.X)


def parse_prompt_attention(text):
    r"""
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
      (abc) - increases attention to abc by a multiplier of 1.1
      (abc:3.12) - increases attention to abc by a multiplier of 3.12
      [abc] - decreases attention to abc by a multiplier of 1.1
      \( - literal character '('
      \[ - literal character '['
      \) - literal character ')'
      \] - literal character ']'
      \\ - literal character '\'
      anything else - just text

    Unclosed brackets are treated as if they were closed at the end of the
    string, and an empty prompt yields ``[['', 1.0]]``.

    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention(r'\(literal\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """

    res = []
    # Each stack entry is the index in ``res`` where an open bracket started.
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        # Scale the weight of every chunk emitted since the bracket opened.
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        token = m.group(0)
        weight = m.group(1)

        if token.startswith('\\'):
            # Escaped character: keep it literally, without the backslash.
            res.append([token[1:], 1.0])
        elif token == '(':
            round_brackets.append(len(res))
        elif token == '[':
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif token == ')' and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif token == ']' and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            res.append([token, 1.0])

    # Brackets that were never closed still apply to everything after them.
    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)

    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if not res:
        res = [["", 1.0]]

    # merge runs of identical weights
    i = 0
    while i + 1 < len(res):
        if res[i][1] == res[i + 1][1]:
            res[i][0] += res[i + 1][0]
            res.pop(i + 1)
        else:
            i += 1

    return res
+class WebUIFrozenCLIPEmebedder(AbstractEncoder):
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", freeze=True, layer="penultimate"):
+        super(WebUIFrozenCLIPEmebedder, self).__init__()
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPModel.from_pretrained(version).text_model
+        self.device = device
+        self.layer = layer
+        if freeze:
+            self.freeze()
+        self.comma_token = [v for k, v in self.tokenizer.get_vocab().items() if k == ',</w>'][0]
+        self.comma_padding_backtrack = 20
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+    def tokenize(self, texts):
+        tokenized = self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
+        return tokenized
+    def encode_with_transformers(self, tokens):
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer!='last')
+        if self.layer == 'penultimate':
+            z = outputs.hidden_states[-2]
+            z = self.transformer.final_layer_norm(z)
+        else:
+            z = outputs.last_hidden_state
+        return z
+    def tokenize_line(self, line):
+        parsed = parse_prompt_attention(line)
+        # print(parsed)
+        tokenized = self.tokenize([text for text, _ in parsed])
+        remade_tokens = []
+        multipliers = []
+        last_comma = -1
+        for tokens, (text, weight) in zip(tokenized, parsed):
+            i = 0
+            while i < len(tokens):
+                token = tokens[i]
+                if token == self.comma_token:
+                    last_comma = len(remade_tokens)
+                elif self.comma_padding_backtrack != 0 and max(len(remade_tokens),
+                                                               1) % 75 == 0 and last_comma != -1 and len(
+                        remade_tokens) - last_comma <= self.comma_padding_backtrack:
+                    last_comma += 1
+                    reloc_tokens = remade_tokens[last_comma:]
+                    reloc_mults = multipliers[last_comma:]
+                    remade_tokens = remade_tokens[:last_comma]
+                    length = len(remade_tokens)
+                    rem = int(math.ceil(length / 75)) * 75 - length
+                    remade_tokens += [self.tokenizer.eos_token_id] * rem + reloc_tokens
+                    multipliers = multipliers[:last_comma] + [1.0] * rem + reloc_mults
+                remade_tokens.append(token)
+                multipliers.append(weight)
+                i += 1
+        token_count = len(remade_tokens)
+        prompt_target_length = math.ceil(max(token_count, 1) / 75) * 75
+        tokens_to_add = prompt_target_length - len(remade_tokens)
+        remade_tokens = remade_tokens + [self.tokenizer.eos_token_id] * tokens_to_add
+        multipliers = multipliers + [1.0] * tokens_to_add
+        return remade_tokens, multipliers, token_count
+    def process_text(self, texts):
+        remade_batch_tokens = []
+        token_count = 0
+        cache = {}
+        batch_multipliers = []
+        for line in texts:
+            if line in cache:
+                remade_tokens, multipliers = cache[line]
+            else:
+                remade_tokens, multipliers, current_token_count = self.tokenize_line(line)
+                token_count = max(current_token_count, token_count)
+                cache[line] = (remade_tokens, multipliers)
+            remade_batch_tokens.append(remade_tokens)
+            batch_multipliers.append(multipliers)
+        return batch_multipliers, remade_batch_tokens, token_count
+    def process_tokens(self, remade_batch_tokens, batch_multipliers):
+        remade_batch_tokens = [[self.tokenizer.bos_token_id] + x[:75] + [self.tokenizer.eos_token_id] for x in remade_batch_tokens]
+        batch_multipliers = [[1.0] + x[:75] + [1.0] for x in batch_multipliers]
+        tokens = torch.asarray(remade_batch_tokens).to(self.device)
+        z = self.encode_with_transformers(tokens)
+        # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
+        batch_multipliers_of_same_length = [x + [1.0] * (75 - len(x)) for x in batch_multipliers]
+        batch_multipliers = torch.asarray(batch_multipliers_of_same_length).to(self.device)
+        original_mean = z.mean()
+        z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
+        new_mean = z.mean()
+        z *= original_mean / new_mean
+        return z
+    def forward(self, text):
+        batch_multipliers, remade_batch_tokens, token_count = self.process_text(text)
+        z = None
+        i = 0
+        while max(map(len, remade_batch_tokens)) != 0:
+            rem_tokens = [x[75:] for x in remade_batch_tokens]
+            rem_multipliers = [x[75:] for x in batch_multipliers]
+            tokens = []
+            multipliers = []
+            for j in range(len(remade_batch_tokens)):
+                if len(remade_batch_tokens[j]) > 0:
+                    tokens.append(remade_batch_tokens[j][:75])
+                    multipliers.append(batch_multipliers[j][:75])
+                else:
+                    tokens.append([self.tokenizer.eos_token_id] * 75)
+                    multipliers.append([1.0] * 75)
+            z1 = self.process_tokens(tokens, multipliers)
+            z = z1 if z is None else torch.cat((z, z1), axis=-2)
+            remade_batch_tokens = rem_tokens
+            batch_multipliers = rem_multipliers
+            i += 1
+        return z
+    def encode(self, text):
+        return self(text)
 if __name__ == "__main__":  # executed only when this module is run as a script
     model = FrozenCLIPEmbedder()
+    count_params(model, verbose=True)  # hand the freshly built encoder to count_params in verbose mode

ldm/modules/{structure_condition → extra_condition}/__init__.py RENAMED Viewed

File without changes

ldm/modules/extra_condition/api.py ADDED Viewed

	@@ -0,0 +1,269 @@

+from enum import Enum, unique
+import cv2
+import torch
+from basicsr.utils import img2tensor
+from ldm.util import resize_numpy_image
+from PIL import Image
+from torch import autocast
+@unique  # no two members may share a value
+class ExtraCondition(Enum):  # condition kinds that get_cond_model() dispatches on
+    sketch = 0  # get_cond_model loads a pidinet edge network for it
+    keypose = 1  # get_cond_model loads an mmdet detector plus an mmpose model
+    seg = 2  # get_cond_model raises NotImplementedError for it
+    depth = 3  # get_cond_model loads MiDaSInference (dpt_hybrid)
+    canny = 4  # needs no model (get_cond_model returns None)
+    style = 5  # get_cond_model loads a CLIP processor and CLIPVisionModel
+    color = 6  # needs no model (get_cond_model returns None)
+    openpose = 7  # get_cond_model loads OpenposeInference
+def get_cond_model(opt, cond_type: ExtraCondition):
+    if cond_type == ExtraCondition.sketch:
+        from ldm.modules.extra_condition.model_edge import pidinet
+        model = pidinet()
+        ckp = torch.load('models/table5_pidinet.pth', map_location='cpu')['state_dict']
+        model.load_state_dict({k.replace('module.', ''): v for k, v in ckp.items()}, strict=True)
+        model.to(opt.device)
+        return model
+    elif cond_type == ExtraCondition.seg:
+        raise NotImplementedError
+    elif cond_type == ExtraCondition.keypose:
+        import mmcv
+        from mmdet.apis import init_detector
+        from mmpose.apis import init_pose_model
+        det_config = 'configs/mm/faster_rcnn_r50_fpn_coco.py'
+        det_checkpoint = 'models/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+        pose_config = 'configs/mm/hrnet_w48_coco_256x192.py'
+        pose_checkpoint = 'models/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
+        det_config_mmcv = mmcv.Config.fromfile(det_config)
+        det_model = init_detector(det_config_mmcv, det_checkpoint, device=opt.device)
+        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
+        pose_model = init_pose_model(pose_config_mmcv, pose_checkpoint, device=opt.device)
+        return {'pose_model': pose_model, 'det_model': det_model}
+    elif cond_type == ExtraCondition.depth:
+        from ldm.modules.extra_condition.midas.api import MiDaSInference
+        model = MiDaSInference(model_type='dpt_hybrid').to(opt.device)
+        return model
+    elif cond_type == ExtraCondition.canny:
+        return None
+    elif cond_type == ExtraCondition.style:
+        from transformers import CLIPProcessor, CLIPVisionModel
+        version = 'openai/clip-vit-large-patch14'
+        processor = CLIPProcessor.from_pretrained(version)
+        clip_vision_model = CLIPVisionModel.from_pretrained(version).to(opt.device)
+        return {'processor': processor, 'clip_vision_model': clip_vision_model}
+    elif cond_type == ExtraCondition.color:
+        return None
+    elif cond_type == ExtraCondition.openpose:
+        from ldm.modules.extra_condition.openpose.api import OpenposeInference
+        model = OpenposeInference().to(opt.device)
+        return model
+    else:
+        raise NotImplementedError
+def get_cond_sketch(opt, cond_image, cond_inp_type, cond_model=None):
+    if isinstance(cond_image, str):
+        edge = cv2.imread(cond_image)
+    else:
+        # for gradio input, pay attention, it's rgb numpy
+        edge = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    edge = resize_numpy_image(edge, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = edge.shape[:2]
+    if cond_inp_type == 'sketch':
+        edge = img2tensor(edge)[0].unsqueeze(0).unsqueeze(0) / 255.
+        edge = edge.to(opt.device)
+    elif cond_inp_type == 'image':
+        edge = img2tensor(edge).unsqueeze(0) / 255.
+        edge = cond_model(edge.to(opt.device))[-1]
+    else:
+        raise NotImplementedError
+    # edge = 1-edge # for white background
+    edge = edge > 0.5
+    edge = edge.float()
+    return edge
+def get_cond_seg(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        seg = cv2.imread(cond_image)
+    else:
+        seg = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    seg = resize_numpy_image(seg, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = seg.shape[:2]
+    if cond_inp_type == 'seg':
+        seg = img2tensor(seg).unsqueeze(0) / 255.
+        seg = seg.to(opt.device)
+    else:
+        raise NotImplementedError
+    return seg
+def get_cond_keypose(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        pose = cv2.imread(cond_image)
+    else:
+        pose = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    pose = resize_numpy_image(pose, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = pose.shape[:2]
+    if cond_inp_type == 'keypose':
+        pose = img2tensor(pose).unsqueeze(0) / 255.
+        pose = pose.to(opt.device)
+    elif cond_inp_type == 'image':
+        from ldm.modules.extra_condition.utils import imshow_keypoints
+        from mmdet.apis import inference_detector
+        from mmpose.apis import (inference_top_down_pose_model, process_mmdet_results)
+        # mmpose seems not compatible with autocast fp16
+        with autocast("cuda", dtype=torch.float32):
+            mmdet_results = inference_detector(cond_model['det_model'], pose)
+            # keep the person class bounding boxes.
+            person_results = process_mmdet_results(mmdet_results, 1)
+            # optional
+            return_heatmap = False
+            dataset = cond_model['pose_model'].cfg.data['test']['type']
+            # e.g. use ('backbone', ) to return backbone feature
+            output_layer_names = None
+            pose_results, returned_outputs = inference_top_down_pose_model(
+                cond_model['pose_model'],
+                pose,
+                person_results,
+                bbox_thr=0.2,
+                format='xyxy',
+                dataset=dataset,
+                dataset_info=None,
+                return_heatmap=return_heatmap,
+                outputs=output_layer_names)
+        # show the results
+        pose = imshow_keypoints(pose, pose_results, radius=2, thickness=2)
+        pose = img2tensor(pose).unsqueeze(0) / 255.
+        pose = pose.to(opt.device)
+    else:
+        raise NotImplementedError
+    return pose
+def get_cond_depth(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        depth = cv2.imread(cond_image)
+    else:
+        depth = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    depth = resize_numpy_image(depth, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = depth.shape[:2]
+    if cond_inp_type == 'depth':
+        depth = img2tensor(depth).unsqueeze(0) / 255.
+        depth = depth.to(opt.device)
+    elif cond_inp_type == 'image':
+        depth = img2tensor(depth).unsqueeze(0) / 127.5 - 1.0
+        depth = cond_model(depth.to(opt.device)).repeat(1, 3, 1, 1)
+        depth -= torch.min(depth)
+        depth /= torch.max(depth)
+    else:
+        raise NotImplementedError
+    return depth
+def get_cond_canny(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        canny = cv2.imread(cond_image)
+    else:
+        canny = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    canny = resize_numpy_image(canny, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = canny.shape[:2]
+    if cond_inp_type == 'canny':
+        canny = img2tensor(canny)[0:1].unsqueeze(0) / 255.
+        canny = canny.to(opt.device)
+    elif cond_inp_type == 'image':
+        canny = cv2.Canny(canny, 100, 200)[..., None]
+        canny = img2tensor(canny).unsqueeze(0) / 255.
+        canny = canny.to(opt.device)
+    else:
+        raise NotImplementedError
+    return canny
+def get_cond_style(opt, cond_image, cond_inp_type='image', cond_model=None):
+    assert cond_inp_type == 'image'
+    if isinstance(cond_image, str):
+        style = Image.open(cond_image)
+    else:
+        # numpy image to PIL image
+        style = Image.fromarray(cond_image)
+    style_for_clip = cond_model['processor'](images=style, return_tensors="pt")['pixel_values']
+    style_feat = cond_model['clip_vision_model'](style_for_clip.to(opt.device))['last_hidden_state']
+    return style_feat
+def get_cond_color(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        color = cv2.imread(cond_image)
+    else:
+        color = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    color = resize_numpy_image(color, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = color.shape[:2]
+    if cond_inp_type == 'image':
+        color = cv2.resize(color, (opt.W//64, opt.H//64), interpolation=cv2.INTER_CUBIC)
+        color = cv2.resize(color, (opt.W, opt.H), interpolation=cv2.INTER_NEAREST)
+    color = img2tensor(color).unsqueeze(0) / 255.
+    color = color.to(opt.device)
+    return color
+def get_cond_openpose(opt, cond_image, cond_inp_type='image', cond_model=None):
+    if isinstance(cond_image, str):
+        openpose_keypose = cv2.imread(cond_image)
+    else:
+        openpose_keypose = cv2.cvtColor(cond_image, cv2.COLOR_RGB2BGR)
+    openpose_keypose = resize_numpy_image(
+        openpose_keypose, max_resolution=opt.max_resolution, resize_short_edge=opt.resize_short_edge)
+    opt.H, opt.W = openpose_keypose.shape[:2]
+    if cond_inp_type == 'openpose':
+        openpose_keypose = img2tensor(openpose_keypose).unsqueeze(0) / 255.
+        openpose_keypose = openpose_keypose.to(opt.device)
+    elif cond_inp_type == 'image':
+        with autocast('cuda', dtype=torch.float32):
+            openpose_keypose = cond_model(openpose_keypose)
+        openpose_keypose = img2tensor(openpose_keypose).unsqueeze(0) / 255.
+        openpose_keypose = openpose_keypose.to(opt.device)
+    else:
+        raise NotImplementedError
+    return openpose_keypose
+def get_adapter_feature(inputs, adapters):
+    ret_feat_map = None
+    ret_feat_seq = None
+    if not isinstance(inputs, list):
+        inputs = [inputs]
+        adapters = [adapters]
+    for input, adapter in zip(inputs, adapters):
+        cur_feature = adapter['model'](input)
+        if isinstance(cur_feature, list):
+            if ret_feat_map is None:
+                ret_feat_map = list(map(lambda x: x * adapter['cond_weight'], cur_feature))
+            else:
+                ret_feat_map = list(map(lambda x, y: x + y * adapter['cond_weight'], ret_feat_map, cur_feature))
+        else:
+            if ret_feat_seq is None:
+                ret_feat_seq = cur_feature
+            else:
+                ret_feat_seq = torch.cat([ret_feat_seq, cur_feature], dim=1)
+    return ret_feat_map, ret_feat_seq

ldm/modules/{structure_condition/midas → extra_condition}/midas/__init__.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/api.py RENAMED Viewed

@@ -6,10 +6,10 @@ import torch
 import torch.nn as nn
 from torchvision.transforms import Compose
-from ldm.modules.structure_condition.midas.midas.dpt_depth import DPTDepthModel
-from ldm.modules.structure_condition.midas.midas.midas_net import MidasNet
-from ldm.modules.structure_condition.midas.midas.midas_net_custom import MidasNet_small
-from ldm.modules.structure_condition.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet
 ISL_PATHS = {

 import torch.nn as nn
 from torchvision.transforms import Compose
+from ldm.modules.extra_condition.midas.midas.dpt_depth import DPTDepthModel
+from ldm.modules.extra_condition.midas.midas.midas_net import MidasNet
+from ldm.modules.extra_condition.midas.midas.midas_net_custom import MidasNet_small
+from ldm.modules.extra_condition.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet
 ISL_PATHS = {

ldm/modules/{structure_condition/openpose → extra_condition/midas/midas}/__init__.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/base_model.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/blocks.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/dpt_depth.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/midas_net.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/midas_net_custom.py RENAMED Viewed

File without changes

ldm/modules/{structure_condition → extra_condition}/midas/midas/transforms.py RENAMED Viewed

File without changes